/* Mirror of https://github.com/cemu-project/Cemu.git (synced 2024-11-23 09:39:18 +01:00); 2013 lines, 86 KiB, C. */
|
/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
|
||
|
/*****************************************************************************/
/*                                                                           */
/*  File Name         : ih264_deblk_luma_ssse3.c                             */
/*                                                                           */
/*  Description       : Contains function definitions for deblocking         */
/*                                                                           */
/*  List of Functions : ih264_deblk_luma_vert_bs4_ssse3()                    */
/*                      ih264_deblk_luma_horz_bs4_ssse3()                    */
/*                      ih264_deblk_luma_vert_bslt4_ssse3()                  */
/*                      ih264_deblk_luma_horz_bslt4_ssse3()                  */
/*                      ih264_deblk_luma_vert_bs4_mbaff_ssse3()              */
/*                      ih264_deblk_luma_vert_bslt4_mbaff_ssse3()            */
/*                                                                           */
/*  Issues / Problems : None                                                 */
/*                                                                           */
/*  Revision History  :                                                      */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Added luma deblocking ssse3          */
/*                                      intrinsics                           */
/*                                                                           */
/*****************************************************************************/
|
||
|
|
||
|
/*****************************************************************************/
|
||
|
/* File Includes */
|
||
|
/*****************************************************************************/
|
||
|
|
||
|
/* System include files */
|
||
|
#include <stdio.h>
|
||
|
|
||
|
/* User include files */
|
||
|
#include "ih264_typedefs.h"
|
||
|
#include "ih264_platform_macros.h"
|
||
|
#include "ih264_deblk_edge_filters.h"
|
||
|
#include "ih264_macros.h"
|
||
|
|
||
|
/*****************************************************************************/
|
||
|
/* Function Definitions */
|
||
|
/*****************************************************************************/
|
||
|
|
||
|
/*****************************************************************************/
|
||
|
/* */
|
||
|
/* Function Name : ih264_deblk_luma_vert_bs4_ssse3() */
|
||
|
/* */
|
||
|
/* Description : This function performs filtering of a luma block */
|
||
|
/* vertical edge when the boundary strength is set to 4. */
|
||
|
/* */
|
||
|
/* Inputs : pu1_src - pointer to the src sample q0 */
|
||
|
/* src_strd - source stride */
|
||
|
/* alpha - alpha value for the boundary */
|
||
|
/* beta - beta value for the boundary */
|
||
|
/* */
|
||
|
/* Globals : None */
|
||
|
/* */
|
||
|
/* Processing : This operation is described in Sec. 8.7.2.4 under the */
|
||
|
/* title "Filtering process for edges for bS equal to 4" in */
|
||
|
/* ITU T Rec H.264. */
|
||
|
/* */
|
||
|
/* Outputs : None */
|
||
|
/* */
|
||
|
/* Returns : None */
|
||
|
/* */
|
||
|
/* Issues : None */
|
||
|
/* */
|
||
|
/* Revision History: */
|
||
|
/* */
|
||
|
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
|
||
|
/* 12 02 2015 Naveen Kumar P Initial version */
|
||
|
/* */
|
||
|
/*****************************************************************************/
|
||
|
void ih264_deblk_luma_vert_bs4_ssse3(UWORD8 *pu1_src,
|
||
|
WORD32 src_strd,
|
||
|
WORD32 alpha,
|
||
|
WORD32 beta)
|
||
|
{
|
||
|
__m128i zero = _mm_setzero_si128();
|
||
|
__m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
|
||
|
__m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
|
||
|
__m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
|
||
|
__m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
|
||
|
__m128i q0_16x8_1;
|
||
|
__m128i p0_16x8_1;
|
||
|
__m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
|
||
|
__m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
|
||
|
__m128i temp1, temp2, temp3, temp4, temp5, temp6;
|
||
|
__m128i Alpha_8x16, Beta_8x16;
|
||
|
__m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
|
||
|
__m128i const_val2_16x8 = _mm_set1_epi16(2);
|
||
|
__m128i line1, line2, line3, line4, line5, line6, line7, line8;
|
||
|
|
||
|
Alpha_8x16 = _mm_set1_epi16(alpha);
|
||
|
Beta_8x16 = _mm_set1_epi16(beta);
|
||
|
|
||
|
line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
|
||
|
line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
|
||
|
line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
|
||
|
line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
|
||
|
line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
|
||
|
line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
|
||
|
line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
|
||
|
line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));
|
||
|
|
||
|
temp1 = _mm_unpacklo_epi8(line1, line2);
|
||
|
temp2 = _mm_unpacklo_epi8(line3, line4);
|
||
|
temp3 = _mm_unpacklo_epi8(line5, line6);
|
||
|
temp4 = _mm_unpacklo_epi8(line7, line8);
|
||
|
|
||
|
line1 = _mm_unpacklo_epi16(temp1, temp2);
|
||
|
line2 = _mm_unpackhi_epi16(temp1, temp2);
|
||
|
line3 = _mm_unpacklo_epi16(temp3, temp4);
|
||
|
line4 = _mm_unpackhi_epi16(temp3, temp4);
|
||
|
|
||
|
p1_8x16 = _mm_unpacklo_epi32(line1, line3);
|
||
|
p0_8x16 = _mm_unpackhi_epi32(line1, line3);
|
||
|
q0_8x16 = _mm_unpacklo_epi32(line2, line4);
|
||
|
q1_8x16 = _mm_unpackhi_epi32(line2, line4);
|
||
|
|
||
|
line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 8 * src_strd));
|
||
|
line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 9 * src_strd));
|
||
|
line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 10 * src_strd));
|
||
|
line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 11 * src_strd));
|
||
|
line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 12 * src_strd));
|
||
|
line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 13 * src_strd));
|
||
|
line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 14 * src_strd));
|
||
|
line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 15 * src_strd));
|
||
|
|
||
|
temp1 = _mm_unpacklo_epi8(line1, line2);
|
||
|
temp2 = _mm_unpacklo_epi8(line3, line4);
|
||
|
temp3 = _mm_unpacklo_epi8(line5, line6);
|
||
|
temp4 = _mm_unpacklo_epi8(line7, line8);
|
||
|
|
||
|
line1 = _mm_unpacklo_epi16(temp1, temp2);
|
||
|
line2 = _mm_unpackhi_epi16(temp1, temp2);
|
||
|
line3 = _mm_unpacklo_epi16(temp3, temp4);
|
||
|
line4 = _mm_unpackhi_epi16(temp3, temp4);
|
||
|
|
||
|
temp1 = _mm_unpacklo_epi32(line1, line3);
|
||
|
temp2 = _mm_unpackhi_epi32(line1, line3);
|
||
|
temp3 = _mm_unpacklo_epi32(line2, line4);
|
||
|
temp4 = _mm_unpackhi_epi32(line2, line4);
|
||
|
|
||
|
p3_16x8 = _mm_unpacklo_epi64(p1_8x16, temp1);
|
||
|
p2_16x8 = _mm_unpackhi_epi64(p1_8x16, temp1);
|
||
|
q2_16x8 = _mm_unpacklo_epi64(q1_8x16, temp4);
|
||
|
q3_16x8 = _mm_unpackhi_epi64(q1_8x16, temp4);
|
||
|
p1_16x8 = _mm_unpacklo_epi64(p0_8x16, temp2);
|
||
|
p0_16x8 = _mm_unpackhi_epi64(p0_8x16, temp2);
|
||
|
q0_16x8 = _mm_unpacklo_epi64(q0_8x16, temp3);
|
||
|
q1_16x8 = _mm_unpackhi_epi64(q0_8x16, temp3);
|
||
|
|
||
|
//Cond1 (ABS(p0 - q0) < alpha)
|
||
|
temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
|
||
|
temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
|
||
|
temp2 = _mm_unpacklo_epi8(temp1, zero);
|
||
|
temp1 = _mm_unpackhi_epi8(temp1, zero);
|
||
|
|
||
|
temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
|
||
|
temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
|
||
|
|
||
|
flag1_16x8 = _mm_packs_epi16(temp2, temp1);
|
||
|
|
||
|
//Cond2 (ABS(q1 - q0) < beta)
|
||
|
temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
|
||
|
temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
|
||
|
temp2 = _mm_unpacklo_epi8(temp1, zero);
|
||
|
temp1 = _mm_unpackhi_epi8(temp1, zero);
|
||
|
|
||
|
temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
|
||
|
temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
|
||
|
|
||
|
flag2_16x8 = _mm_packs_epi16(temp2, temp1);
|
||
|
|
||
|
flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
|
||
|
|
||
|
//Cond3 (ABS(p1 - p0) < beta)
|
||
|
temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
|
||
|
temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
|
||
|
temp2 = _mm_unpacklo_epi8(temp1, zero);
|
||
|
temp1 = _mm_unpackhi_epi8(temp1, zero);
|
||
|
|
||
|
temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
|
||
|
temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
|
||
|
|
||
|
flag2_16x8 = _mm_packs_epi16(temp2, temp1);
|
||
|
|
||
|
// !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
|
||
|
flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
|
||
|
|
||
|
// (ABS(p0 - q0) < ((alpha >> 2) + 2))
|
||
|
temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
|
||
|
temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
|
||
|
Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);
|
||
|
|
||
|
temp2 = _mm_unpacklo_epi8(temp1, zero);
|
||
|
temp1 = _mm_unpackhi_epi8(temp1, zero);
|
||
|
temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
|
||
|
temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
|
||
|
|
||
|
flag2_16x8 = _mm_packs_epi16(temp2, temp1);
|
||
|
flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
|
||
|
|
||
|
// (ABS(p2 - p0) < beta)
|
||
|
temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
|
||
|
temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
|
||
|
temp2 = _mm_unpacklo_epi8(temp1, zero);
|
||
|
temp1 = _mm_unpackhi_epi8(temp1, zero);
|
||
|
temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
|
||
|
temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
|
||
|
|
||
|
flag3_16x8 = _mm_packs_epi16(temp2, temp1);
|
||
|
flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);
|
||
|
|
||
|
// (ABS(q2 - q0) < beta)
|
||
|
temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
|
||
|
temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
|
||
|
temp2 = _mm_unpacklo_epi8(temp1, zero);
|
||
|
temp1 = _mm_unpackhi_epi8(temp1, zero);
|
||
|
temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
|
||
|
temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
|
||
|
|
||
|
flag4_16x8 = _mm_packs_epi16(temp2, temp1);
|
||
|
flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);
|
||
|
|
||
|
// First 8 pixels
|
||
|
p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
|
||
|
p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
|
||
|
p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
|
||
|
p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
|
||
|
q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
|
||
|
q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
|
||
|
q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
|
||
|
q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);
|
||
|
|
||
|
// p0_1 and q0_1
|
||
|
temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
|
||
|
temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
|
||
|
temp5 = _mm_add_epi16(temp1, const_val2_16x8);
|
||
|
temp6 = _mm_add_epi16(temp2, const_val2_16x8);
|
||
|
temp3 = _mm_slli_epi16(p1_8x16, 1);
|
||
|
temp4 = _mm_slli_epi16(q1_8x16, 1);
|
||
|
temp1 = _mm_add_epi16(temp5, temp3);
|
||
|
temp2 = _mm_add_epi16(temp6, temp4);
|
||
|
p0_16x8_1 = _mm_srai_epi16(temp1, 2);
|
||
|
q0_16x8_1 = _mm_srai_epi16(temp2, 2);
|
||
|
|
||
|
// p1_2 and q1_2
|
||
|
temp6 = _mm_add_epi16(temp6, p0_8x16);
|
||
|
temp5 = _mm_add_epi16(temp5, q0_8x16);
|
||
|
temp1 = _mm_add_epi16(temp6, p2_8x16);
|
||
|
temp2 = _mm_add_epi16(temp5, q2_8x16);
|
||
|
p1_16x8_2 = _mm_srai_epi16(temp1, 2);
|
||
|
q1_16x8_2 = _mm_srai_epi16(temp2, 2);
|
||
|
|
||
|
// p0_2 and q0_2
|
||
|
temp1 = _mm_add_epi16(temp3, p2_8x16);
|
||
|
temp2 = _mm_add_epi16(temp4, q2_8x16);
|
||
|
temp1 = _mm_add_epi16(temp1, q1_8x16);
|
||
|
temp2 = _mm_add_epi16(temp2, p1_8x16);
|
||
|
temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
|
||
|
temp3 = _mm_slli_epi16(temp3, 1);
|
||
|
temp1 = _mm_add_epi16(temp1, temp3);
|
||
|
temp2 = _mm_add_epi16(temp2, temp3);
|
||
|
temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
|
||
|
temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
|
||
|
p0_16x8_2 = _mm_srai_epi16(temp1, 3);
|
||
|
q0_16x8_2 = _mm_srai_epi16(temp2, 3);
|
||
|
|
||
|
// p2_2 and q2_2
|
||
|
temp1 = _mm_add_epi16(temp6, const_val2_16x8);
|
||
|
temp2 = _mm_add_epi16(temp5, const_val2_16x8);
|
||
|
temp3 = _mm_slli_epi16(p2_8x16, 1);
|
||
|
temp4 = _mm_slli_epi16(q2_8x16, 1);
|
||
|
temp3 = _mm_add_epi16(p2_8x16, temp3);
|
||
|
temp4 = _mm_add_epi16(q2_8x16, temp4);
|
||
|
temp5 = _mm_slli_epi16(p3_8x16, 1);
|
||
|
temp6 = _mm_slli_epi16(q3_8x16, 1);
|
||
|
temp1 = _mm_add_epi16(temp1, temp3);
|
||
|
temp2 = _mm_add_epi16(temp2, temp4);
|
||
|
temp1 = _mm_add_epi16(temp1, temp5);
|
||
|
temp2 = _mm_add_epi16(temp2, temp6);
|
||
|
p2_16x8_2 = _mm_srai_epi16(temp1, 3);
|
||
|
q2_16x8_2 = _mm_srai_epi16(temp2, 3);
|
||
|
|
||
|
// Second 8 pixels and packing with first 8 pixels
|
||
|
p3_8x16 = _mm_unpackhi_epi8(p3_16x8, zero);
|
||
|
p2_8x16 = _mm_unpackhi_epi8(p2_16x8, zero);
|
||
|
p1_8x16 = _mm_unpackhi_epi8(p1_16x8, zero);
|
||
|
p0_8x16 = _mm_unpackhi_epi8(p0_16x8, zero);
|
||
|
q0_8x16 = _mm_unpackhi_epi8(q0_16x8, zero);
|
||
|
q1_8x16 = _mm_unpackhi_epi8(q1_16x8, zero);
|
||
|
q2_8x16 = _mm_unpackhi_epi8(q2_16x8, zero);
|
||
|
q3_8x16 = _mm_unpackhi_epi8(q3_16x8, zero);
|
||
|
|
||
|
// p0_1 and q0_1
|
||
|
temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
|
||
|
temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
|
||
|
temp5 = _mm_add_epi16(temp1, const_val2_16x8);
|
||
|
temp6 = _mm_add_epi16(temp2, const_val2_16x8);
|
||
|
temp3 = _mm_slli_epi16(p1_8x16, 1);
|
||
|
temp4 = _mm_slli_epi16(q1_8x16, 1);
|
||
|
temp1 = _mm_add_epi16(temp5, temp3);
|
||
|
temp2 = _mm_add_epi16(temp6, temp4);
|
||
|
temp1 = _mm_srai_epi16(temp1, 2);
|
||
|
temp2 = _mm_srai_epi16(temp2, 2);
|
||
|
p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, temp1);
|
||
|
q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, temp2);
|
||
|
|
||
|
// p1_2 and q1_2
|
||
|
temp6 = _mm_add_epi16(temp6, p0_8x16);
|
||
|
temp5 = _mm_add_epi16(temp5, q0_8x16);
|
||
|
temp1 = _mm_add_epi16(temp6, p2_8x16);
|
||
|
temp2 = _mm_add_epi16(temp5, q2_8x16);
|
||
|
temp1 = _mm_srai_epi16(temp1, 2);
|
||
|
temp2 = _mm_srai_epi16(temp2, 2);
|
||
|
p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, temp1);
|
||
|
q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, temp2);
|
||
|
|
||
|
// p0_2 and q0_2
|
||
|
temp1 = _mm_add_epi16(temp3, p2_8x16);
|
||
|
temp2 = _mm_add_epi16(temp4, q2_8x16);
|
||
|
temp1 = _mm_add_epi16(temp1, q1_8x16);
|
||
|
temp2 = _mm_add_epi16(temp2, p1_8x16);
|
||
|
temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
|
||
|
temp3 = _mm_slli_epi16(temp3, 1);
|
||
|
temp1 = _mm_add_epi16(temp1, temp3);
|
||
|
temp2 = _mm_add_epi16(temp2, temp3);
|
||
|
temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
|
||
|
temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
|
||
|
temp1 = _mm_srai_epi16(temp1, 3);
|
||
|
temp2 = _mm_srai_epi16(temp2, 3);
|
||
|
p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, temp1);
|
||
|
q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, temp2);
|
||
|
|
||
|
// p2_2 and q2_2
|
||
|
temp1 = _mm_add_epi16(temp6, const_val2_16x8);
|
||
|
temp2 = _mm_add_epi16(temp5, const_val2_16x8);
|
||
|
temp3 = _mm_slli_epi16(p2_8x16, 1);
|
||
|
temp4 = _mm_slli_epi16(q2_8x16, 1);
|
||
|
temp3 = _mm_add_epi16(p2_8x16, temp3);
|
||
|
temp4 = _mm_add_epi16(q2_8x16, temp4);
|
||
|
temp5 = _mm_slli_epi16(p3_8x16, 1);
|
||
|
temp6 = _mm_slli_epi16(q3_8x16, 1);
|
||
|
temp1 = _mm_add_epi16(temp1, temp3);
|
||
|
temp2 = _mm_add_epi16(temp2, temp4);
|
||
|
temp1 = _mm_add_epi16(temp1, temp5);
|
||
|
temp2 = _mm_add_epi16(temp2, temp6);
|
||
|
temp1 = _mm_srai_epi16(temp1, 3);
|
||
|
temp2 = _mm_srai_epi16(temp2, 3);
|
||
|
p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, temp1);
|
||
|
q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, temp2);
|
||
|
|
||
|
// p0 and q0
|
||
|
p0_16x8 = _mm_and_si128(p0_16x8,
|
||
|
_mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
|
||
|
p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
|
||
|
p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
|
||
|
q0_16x8 = _mm_and_si128(q0_16x8,
|
||
|
_mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
|
||
|
q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
|
||
|
q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);
|
||
|
|
||
|
// p0 and q0
|
||
|
p0_16x8 = _mm_and_si128(p0_16x8,
|
||
|
_mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
|
||
|
p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
|
||
|
p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
|
||
|
q0_16x8 = _mm_and_si128(q0_16x8,
|
||
|
_mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
|
||
|
q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
|
||
|
q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);
|
||
|
|
||
|
// p1 and q1
|
||
|
p1_16x8 = _mm_and_si128(p1_16x8,
|
||
|
_mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
|
||
|
p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
|
||
|
p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
|
||
|
q1_16x8 = _mm_and_si128(q1_16x8,
|
||
|
_mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
|
||
|
q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
|
||
|
q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);
|
||
|
|
||
|
// p2 and q2
|
||
|
p2_16x8 = _mm_and_si128(p2_16x8,
|
||
|
_mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
|
||
|
p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
|
||
|
p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
|
||
|
q2_16x8 = _mm_and_si128(q2_16x8,
|
||
|
_mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
|
||
|
q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
|
||
|
q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);
|
||
|
|
||
|
temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
|
||
|
temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8);
|
||
|
temp3 = _mm_unpacklo_epi8(q0_16x8, q1_16x8);
|
||
|
temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);
|
||
|
|
||
|
p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
|
||
|
p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
|
||
|
q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
|
||
|
q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);
|
||
|
|
||
|
line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
|
||
|
line2 = _mm_srli_si128(line1, 8);
|
||
|
line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
|
||
|
line4 = _mm_srli_si128(line3, 8);
|
||
|
line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
|
||
|
line6 = _mm_srli_si128(line5, 8);
|
||
|
line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
|
||
|
line8 = _mm_srli_si128(line7, 8);
|
||
|
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);
|
||
|
|
||
|
temp1 = _mm_unpackhi_epi8(p3_16x8, p2_16x8);
|
||
|
temp2 = _mm_unpackhi_epi8(p1_16x8, p0_16x8);
|
||
|
temp3 = _mm_unpackhi_epi8(q0_16x8, q1_16x8);
|
||
|
temp4 = _mm_unpackhi_epi8(q2_16x8, q3_16x8);
|
||
|
|
||
|
p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
|
||
|
p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
|
||
|
q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
|
||
|
q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);
|
||
|
|
||
|
line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
|
||
|
line2 = _mm_srli_si128(line1, 8);
|
||
|
line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
|
||
|
line4 = _mm_srli_si128(line3, 8);
|
||
|
line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
|
||
|
line6 = _mm_srli_si128(line5, 8);
|
||
|
line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
|
||
|
line8 = _mm_srli_si128(line7, 8);
|
||
|
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 8 * src_strd), line1);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 9 * src_strd), line2);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 10 * src_strd), line3);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 11 * src_strd), line4);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 12 * src_strd), line5);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 13 * src_strd), line6);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 14 * src_strd), line7);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 15 * src_strd), line8);
|
||
|
|
||
|
}
|
||
|
|
||
|
/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_deblk_luma_horz_bs4_ssse3()                        */
/*                                                                           */
/*  Description   : This function performs filtering of a luma block         */
/*                  horizontal edge when the boundary strength is set to 4.  */
/*                                                                           */
/*  Inputs        : pu1_src    - pointer to the src sample q0                */
/*                  src_strd   - source stride                               */
/*                  alpha      - alpha value for the boundary                */
/*                  beta       - beta value for the boundary                 */
/*                                                                           */
/*  Globals       : None                                                     */
/*                                                                           */
/*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
/*                  title "Filtering process for edges for bS equal to 4" in */
/*                  ITU T Rec H.264.                                         */
/*                                                                           */
/*  Outputs       : None                                                     */
/*                                                                           */
/*  Returns       : None                                                     */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Initial version                      */
/*                                                                           */
/*****************************************************************************/
|
||
|
void ih264_deblk_luma_horz_bs4_ssse3(UWORD8 *pu1_src,
|
||
|
WORD32 src_strd,
|
||
|
WORD32 alpha,
|
||
|
WORD32 beta)
|
||
|
{
|
||
|
WORD16 i16_posP3, i16_posP2, i16_posP1, i16_posP0;
|
||
|
WORD16 i16_posQ1, i16_posQ2, i16_posQ3;
|
||
|
UWORD8 *pu1_HorzPixel;
|
||
|
__m128i zero = _mm_setzero_si128();
|
||
|
__m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
|
||
|
__m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
|
||
|
__m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
|
||
|
__m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
|
||
|
__m128i q0_16x8_1;
|
||
|
__m128i p0_16x8_1;
|
||
|
__m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
|
||
|
__m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
|
||
|
__m128i temp1, temp2, temp3, temp4, temp5, temp6;
|
||
|
__m128i Alpha_8x16, Beta_8x16;
|
||
|
__m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
|
||
|
__m128i const_val2_16x8 = _mm_set1_epi16(2);
|
||
|
|
||
|
pu1_HorzPixel = pu1_src - (src_strd << 2);
|
||
|
|
||
|
i16_posQ1 = src_strd;
|
||
|
i16_posQ2 = X2(src_strd);
|
||
|
i16_posQ3 = X3(src_strd);
|
||
|
i16_posP0 = X3(src_strd);
|
||
|
i16_posP1 = X2(src_strd);
|
||
|
i16_posP2 = src_strd;
|
||
|
i16_posP3 = 0;
|
||
|
|
||
|
Alpha_8x16 = _mm_set1_epi16(alpha);
|
||
|
Beta_8x16 = _mm_set1_epi16(beta);
|
||
|
|
||
|
p3_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP3));
|
||
|
p2_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP2));
|
||
|
p1_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP1));
|
||
|
p0_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP0));
|
||
|
q0_16x8 = _mm_loadu_si128((__m128i *)(pu1_src));
|
||
|
q1_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ1));
|
||
|
q2_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ2));
|
||
|
q3_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ3));
|
||
|
|
||
|
//Cond1 (ABS(p0 - q0) < alpha)
|
||
|
temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
|
||
|
temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
|
||
|
temp2 = _mm_unpacklo_epi8(temp1, zero);
|
||
|
temp1 = _mm_unpackhi_epi8(temp1, zero);
|
||
|
|
||
|
temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
|
||
|
temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
|
||
|
|
||
|
flag1_16x8 = _mm_packs_epi16(temp2, temp1);
|
||
|
|
||
|
//Cond2 (ABS(q1 - q0) < beta)
|
||
|
temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
|
||
|
temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
|
||
|
temp2 = _mm_unpacklo_epi8(temp1, zero);
|
||
|
temp1 = _mm_unpackhi_epi8(temp1, zero);
|
||
|
|
||
|
temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
|
||
|
temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
|
||
|
|
||
|
flag2_16x8 = _mm_packs_epi16(temp2, temp1);
|
||
|
|
||
|
flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
|
||
|
|
||
|
//Cond3 (ABS(p1 - p0) < beta)
|
||
|
temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
|
||
|
temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
|
||
|
temp2 = _mm_unpacklo_epi8(temp1, zero);
|
||
|
temp1 = _mm_unpackhi_epi8(temp1, zero);
|
||
|
|
||
|
temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
|
||
|
temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
|
||
|
|
||
|
flag2_16x8 = _mm_packs_epi16(temp2, temp1);
|
||
|
|
||
|
// !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
|
||
|
flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
|
||
|
|
||
|
// (ABS(p0 - q0) < ((alpha >> 2) + 2))
|
||
|
temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
|
||
|
temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
|
||
|
Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);
|
||
|
|
||
|
temp2 = _mm_unpacklo_epi8(temp1, zero);
|
||
|
temp1 = _mm_unpackhi_epi8(temp1, zero);
|
||
|
temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
|
||
|
temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
|
||
|
|
||
|
flag2_16x8 = _mm_packs_epi16(temp2, temp1);
|
||
|
flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
|
||
|
|
||
|
// (ABS(p2 - p0) < beta)
|
||
|
temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
|
||
|
temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
|
||
|
temp2 = _mm_unpacklo_epi8(temp1, zero);
|
||
|
temp1 = _mm_unpackhi_epi8(temp1, zero);
|
||
|
temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
|
||
|
temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
|
||
|
|
||
|
flag3_16x8 = _mm_packs_epi16(temp2, temp1);
|
||
|
flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);
|
||
|
|
||
|
// (ABS(q2 - q0) < beta)
|
||
|
temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
|
||
|
temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
|
||
|
temp2 = _mm_unpacklo_epi8(temp1, zero);
|
||
|
temp1 = _mm_unpackhi_epi8(temp1, zero);
|
||
|
temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
|
||
|
temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
|
||
|
|
||
|
flag4_16x8 = _mm_packs_epi16(temp2, temp1);
|
||
|
flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);
|
||
|
|
||
|
// First 8 pixels
|
||
|
p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
|
||
|
p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
|
||
|
p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
|
||
|
p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
|
||
|
q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
|
||
|
q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
|
||
|
q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
|
||
|
q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);
|
||
|
|
||
|
// p0_1 and q0_1
|
||
|
temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
|
||
|
temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
|
||
|
temp5 = _mm_add_epi16(temp1, const_val2_16x8);
|
||
|
temp6 = _mm_add_epi16(temp2, const_val2_16x8);
|
||
|
temp3 = _mm_slli_epi16(p1_8x16, 1);
|
||
|
temp4 = _mm_slli_epi16(q1_8x16, 1);
|
||
|
temp1 = _mm_add_epi16(temp5, temp3);
|
||
|
temp2 = _mm_add_epi16(temp6, temp4);
|
||
|
p0_16x8_1 = _mm_srai_epi16(temp1, 2);
|
||
|
q0_16x8_1 = _mm_srai_epi16(temp2, 2);
|
||
|
|
||
|
// p1_2 and q1_2
|
||
|
temp6 = _mm_add_epi16(temp6, p0_8x16);
|
||
|
temp5 = _mm_add_epi16(temp5, q0_8x16);
|
||
|
temp1 = _mm_add_epi16(temp6, p2_8x16);
|
||
|
temp2 = _mm_add_epi16(temp5, q2_8x16);
|
||
|
p1_16x8_2 = _mm_srai_epi16(temp1, 2);
|
||
|
q1_16x8_2 = _mm_srai_epi16(temp2, 2);
|
||
|
|
||
|
// p0_2 and q0_2
|
||
|
temp1 = _mm_add_epi16(temp3, p2_8x16);
|
||
|
temp2 = _mm_add_epi16(temp4, q2_8x16);
|
||
|
temp1 = _mm_add_epi16(temp1, q1_8x16);
|
||
|
temp2 = _mm_add_epi16(temp2, p1_8x16);
|
||
|
temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
|
||
|
temp3 = _mm_slli_epi16(temp3, 1);
|
||
|
temp1 = _mm_add_epi16(temp1, temp3);
|
||
|
temp2 = _mm_add_epi16(temp2, temp3);
|
||
|
temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
|
||
|
temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
|
||
|
p0_16x8_2 = _mm_srai_epi16(temp1, 3);
|
||
|
q0_16x8_2 = _mm_srai_epi16(temp2, 3);
|
||
|
|
||
|
// p2_2 and q2_2
|
||
|
temp1 = _mm_add_epi16(temp6, const_val2_16x8);
|
||
|
temp2 = _mm_add_epi16(temp5, const_val2_16x8);
|
||
|
temp3 = _mm_slli_epi16(p2_8x16, 1);
|
||
|
temp4 = _mm_slli_epi16(q2_8x16, 1);
|
||
|
temp3 = _mm_add_epi16(p2_8x16, temp3);
|
||
|
temp4 = _mm_add_epi16(q2_8x16, temp4);
|
||
|
temp5 = _mm_slli_epi16(p3_8x16, 1);
|
||
|
temp6 = _mm_slli_epi16(q3_8x16, 1);
|
||
|
temp1 = _mm_add_epi16(temp1, temp3);
|
||
|
temp2 = _mm_add_epi16(temp2, temp4);
|
||
|
temp1 = _mm_add_epi16(temp1, temp5);
|
||
|
temp2 = _mm_add_epi16(temp2, temp6);
|
||
|
p2_16x8_2 = _mm_srai_epi16(temp1, 3);
|
||
|
q2_16x8_2 = _mm_srai_epi16(temp2, 3);
|
||
|
|
||
|
// Second 8 pixels and packing with first 8 pixels
|
||
|
p3_8x16 = _mm_unpackhi_epi8(p3_16x8, zero);
|
||
|
p2_8x16 = _mm_unpackhi_epi8(p2_16x8, zero);
|
||
|
p1_8x16 = _mm_unpackhi_epi8(p1_16x8, zero);
|
||
|
p0_8x16 = _mm_unpackhi_epi8(p0_16x8, zero);
|
||
|
q0_8x16 = _mm_unpackhi_epi8(q0_16x8, zero);
|
||
|
q1_8x16 = _mm_unpackhi_epi8(q1_16x8, zero);
|
||
|
q2_8x16 = _mm_unpackhi_epi8(q2_16x8, zero);
|
||
|
q3_8x16 = _mm_unpackhi_epi8(q3_16x8, zero);
|
||
|
|
||
|
// p0_1 and q0_1
|
||
|
temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
|
||
|
temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
|
||
|
temp5 = _mm_add_epi16(temp1, const_val2_16x8);
|
||
|
temp6 = _mm_add_epi16(temp2, const_val2_16x8);
|
||
|
temp3 = _mm_slli_epi16(p1_8x16, 1);
|
||
|
temp4 = _mm_slli_epi16(q1_8x16, 1);
|
||
|
temp1 = _mm_add_epi16(temp5, temp3);
|
||
|
temp2 = _mm_add_epi16(temp6, temp4);
|
||
|
temp1 = _mm_srai_epi16(temp1, 2);
|
||
|
temp2 = _mm_srai_epi16(temp2, 2);
|
||
|
p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, temp1);
|
||
|
q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, temp2);
|
||
|
|
||
|
// p1_2 and q1_2
|
||
|
temp6 = _mm_add_epi16(temp6, p0_8x16);
|
||
|
temp5 = _mm_add_epi16(temp5, q0_8x16);
|
||
|
temp1 = _mm_add_epi16(temp6, p2_8x16);
|
||
|
temp2 = _mm_add_epi16(temp5, q2_8x16);
|
||
|
temp1 = _mm_srai_epi16(temp1, 2);
|
||
|
temp2 = _mm_srai_epi16(temp2, 2);
|
||
|
p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, temp1);
|
||
|
q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, temp2);
|
||
|
|
||
|
// p0_2 and q0_2
|
||
|
temp1 = _mm_add_epi16(temp3, p2_8x16);
|
||
|
temp2 = _mm_add_epi16(temp4, q2_8x16);
|
||
|
temp1 = _mm_add_epi16(temp1, q1_8x16);
|
||
|
temp2 = _mm_add_epi16(temp2, p1_8x16);
|
||
|
temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
|
||
|
temp3 = _mm_slli_epi16(temp3, 1);
|
||
|
temp1 = _mm_add_epi16(temp1, temp3);
|
||
|
temp2 = _mm_add_epi16(temp2, temp3);
|
||
|
temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
|
||
|
temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
|
||
|
temp1 = _mm_srai_epi16(temp1, 3);
|
||
|
temp2 = _mm_srai_epi16(temp2, 3);
|
||
|
p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, temp1);
|
||
|
q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, temp2);
|
||
|
|
||
|
// p2_2 and q2_2
|
||
|
temp1 = _mm_add_epi16(temp6, const_val2_16x8);
|
||
|
temp2 = _mm_add_epi16(temp5, const_val2_16x8);
|
||
|
temp3 = _mm_slli_epi16(p2_8x16, 1);
|
||
|
temp4 = _mm_slli_epi16(q2_8x16, 1);
|
||
|
temp3 = _mm_add_epi16(p2_8x16, temp3);
|
||
|
temp4 = _mm_add_epi16(q2_8x16, temp4);
|
||
|
temp5 = _mm_slli_epi16(p3_8x16, 1);
|
||
|
temp6 = _mm_slli_epi16(q3_8x16, 1);
|
||
|
temp1 = _mm_add_epi16(temp1, temp3);
|
||
|
temp2 = _mm_add_epi16(temp2, temp4);
|
||
|
temp1 = _mm_add_epi16(temp1, temp5);
|
||
|
temp2 = _mm_add_epi16(temp2, temp6);
|
||
|
temp1 = _mm_srai_epi16(temp1, 3);
|
||
|
temp2 = _mm_srai_epi16(temp2, 3);
|
||
|
p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, temp1);
|
||
|
q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, temp2);
|
||
|
|
||
|
// p0 and q0
|
||
|
p0_16x8 = _mm_and_si128(p0_16x8,
|
||
|
_mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
|
||
|
p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
|
||
|
p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
|
||
|
q0_16x8 = _mm_and_si128(q0_16x8,
|
||
|
_mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
|
||
|
q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
|
||
|
q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);
|
||
|
|
||
|
// p0 and q0
|
||
|
p0_16x8 = _mm_and_si128(p0_16x8,
|
||
|
_mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
|
||
|
p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
|
||
|
p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
|
||
|
q0_16x8 = _mm_and_si128(q0_16x8,
|
||
|
_mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
|
||
|
q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
|
||
|
q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);
|
||
|
|
||
|
// p1 and q1
|
||
|
p1_16x8 = _mm_and_si128(p1_16x8,
|
||
|
_mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
|
||
|
p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
|
||
|
p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
|
||
|
q1_16x8 = _mm_and_si128(q1_16x8,
|
||
|
_mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
|
||
|
q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
|
||
|
q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);
|
||
|
|
||
|
// p2 and q2
|
||
|
p2_16x8 = _mm_and_si128(p2_16x8,
|
||
|
_mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
|
||
|
p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
|
||
|
p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
|
||
|
q2_16x8 = _mm_and_si128(q2_16x8,
|
||
|
_mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
|
||
|
q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
|
||
|
q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);
|
||
|
|
||
|
_mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP2), p2_16x8);
|
||
|
_mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP1), p1_16x8);
|
||
|
_mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP0), p0_16x8);
|
||
|
|
||
|
_mm_storeu_si128((__m128i *)(pu1_src), q0_16x8);
|
||
|
_mm_storeu_si128((__m128i *)(pu1_src + i16_posQ1), q1_16x8);
|
||
|
_mm_storeu_si128((__m128i *)(pu1_src + i16_posQ2), q2_16x8);
|
||
|
|
||
|
}
|
||
|
|
||
|
/*****************************************************************************/
|
||
|
/* */
|
||
|
/* Function Name : ih264_deblk_luma_vert_bslt4_ssse3() */
|
||
|
/* */
|
||
|
/* Description : This function performs filtering of a luma block */
|
||
|
/* vertical edge when the boundary strength is less than 4. */
|
||
|
/* */
|
||
|
/* Inputs : pu1_src - pointer to the src sample q0 */
|
||
|
/* src_strd - source stride */
|
||
|
/* alpha - alpha value for the boundary */
|
||
|
/* beta - beta value for the boundary */
|
||
|
/* u4_bs - packed Boundary strength array */
|
||
|
/* pu1_cliptab - tc0_table */
|
||
|
/* */
|
||
|
/* Globals : None */
|
||
|
/* */
|
||
|
/* Processing : This operation is described in Sec. 8.7.2.3 under the */
|
||
|
/* title "Filtering process for edges for bS less than 4" */
|
||
|
/* in ITU T Rec H.264. */
|
||
|
/* */
|
||
|
/* Outputs : None */
|
||
|
/* */
|
||
|
/* Returns : None */
|
||
|
/* */
|
||
|
/* Issues : None */
|
||
|
/* */
|
||
|
/* Revision History: */
|
||
|
/* */
|
||
|
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
|
||
|
/* 12 02 2015 Naveen Kumar P Initial version */
|
||
|
/* */
|
||
|
/*****************************************************************************/
|
||
|
void ih264_deblk_luma_vert_bslt4_ssse3(UWORD8 *pu1_src,
|
||
|
WORD32 src_strd,
|
||
|
WORD32 alpha,
|
||
|
WORD32 beta,
|
||
|
UWORD32 u4_bs,
|
||
|
const UWORD8 *pu1_cliptab)
|
||
|
{
|
||
|
UWORD8 u1_Bs, u1_Bs1;
|
||
|
|
||
|
WORD32 j = 0;
|
||
|
|
||
|
__m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
|
||
|
__m128i int1, int2, int3, int4, high1, high2;
|
||
|
__m128i flag, flag1, i_C, i_C0;
|
||
|
__m128i i_Ap, i_Aq, diff, const1, const2, in_macro, in_macrotemp, temp,
|
||
|
temp1;
|
||
|
__m128i zero = _mm_setzero_si128();
|
||
|
|
||
|
for(j = 0; j <= 8 * src_strd; j += 8 * src_strd)
|
||
|
{
|
||
|
//Transpose
|
||
|
linea = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + j));
|
||
|
lineb = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + src_strd + j));
|
||
|
linec = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 2 * src_strd + j));
|
||
|
lined = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 3 * src_strd + j));
|
||
|
|
||
|
linea = _mm_unpacklo_epi8(linea, zero);
|
||
|
lineb = _mm_unpacklo_epi8(lineb, zero);
|
||
|
linec = _mm_unpacklo_epi8(linec, zero);
|
||
|
lined = _mm_unpacklo_epi8(lined, zero);
|
||
|
|
||
|
int1 = _mm_unpacklo_epi16(linea, lineb);
|
||
|
lineb = _mm_unpackhi_epi16(linea, lineb);
|
||
|
|
||
|
int2 = _mm_unpacklo_epi16(linec, lined);
|
||
|
lined = _mm_unpackhi_epi16(linec, lined);
|
||
|
|
||
|
linea = _mm_unpacklo_epi16(int1, int2);
|
||
|
int1 = _mm_unpackhi_epi16(int1, int2);
|
||
|
|
||
|
linec = _mm_unpacklo_epi16(lineb, lined);
|
||
|
high1 = _mm_unpackhi_epi16(lineb, lined);
|
||
|
|
||
|
linee = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 4 * src_strd + j));
|
||
|
linef = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 5 * src_strd + j));
|
||
|
lineg = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 6 * src_strd + j));
|
||
|
lineh = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 7 * src_strd + j));
|
||
|
|
||
|
linee = _mm_unpacklo_epi8(linee, zero);
|
||
|
linef = _mm_unpacklo_epi8(linef, zero);
|
||
|
lineg = _mm_unpacklo_epi8(lineg, zero);
|
||
|
lineh = _mm_unpacklo_epi8(lineh, zero);
|
||
|
|
||
|
int2 = _mm_unpacklo_epi16(linee, linef);
|
||
|
linef = _mm_unpackhi_epi16(linee, linef);
|
||
|
|
||
|
int3 = _mm_unpacklo_epi16(lineg, lineh);
|
||
|
lineh = _mm_unpackhi_epi16(lineg, lineh);
|
||
|
|
||
|
linee = _mm_unpacklo_epi16(int2, int3);
|
||
|
int2 = _mm_unpackhi_epi16(int2, int3);
|
||
|
|
||
|
lineg = _mm_unpacklo_epi16(linef, lineh);
|
||
|
high2 = _mm_unpackhi_epi16(linef, lineh);
|
||
|
|
||
|
int4 = _mm_unpacklo_epi16(linea, linee);
|
||
|
lineb = _mm_unpackhi_epi16(linea, linee);
|
||
|
|
||
|
int3 = _mm_unpacklo_epi16(int1, int2);
|
||
|
lined = _mm_unpackhi_epi16(int1, int2);
|
||
|
|
||
|
int2 = _mm_unpacklo_epi16(linec, lineg);
|
||
|
linef = _mm_unpackhi_epi16(linec, lineg);
|
||
|
|
||
|
linea = int4;
|
||
|
linec = int3;
|
||
|
linee = int2;
|
||
|
|
||
|
lineg = _mm_unpacklo_epi16(high1, high2);
|
||
|
lineh = _mm_unpackhi_epi16(high1, high2);
|
||
|
|
||
|
//end of transpose
|
||
|
|
||
|
u1_Bs = (u4_bs >> 24) & 0xff;
|
||
|
u1_Bs1 = (u4_bs >> 16) & 0xff;
|
||
|
u4_bs <<= 16;
|
||
|
|
||
|
flag1 = _mm_set_epi16(u1_Bs1, u1_Bs, u1_Bs1, u1_Bs, u1_Bs1, u1_Bs,
|
||
|
u1_Bs1, u1_Bs);
|
||
|
flag1 = _mm_cmpeq_epi16(flag1, zero); //Set flag to 1s and 0s
|
||
|
flag1 = _mm_xor_si128(flag1, _mm_set1_epi16(0xFFFF)); //Invert for required mask
|
||
|
|
||
|
i_C0 = _mm_set_epi16(pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs],
|
||
|
pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs],
|
||
|
pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs],
|
||
|
pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs]);
|
||
|
|
||
|
diff = _mm_subs_epi16(linec, lined); //Condn 1
|
||
|
diff = _mm_abs_epi16(diff);
|
||
|
const1 = _mm_set1_epi16(alpha);
|
||
|
flag = _mm_cmpgt_epi16(const1, diff);
|
||
|
|
||
|
diff = _mm_subs_epi16(linee, lined); //Condtn 2
|
||
|
diff = _mm_abs_epi16(diff);
|
||
|
const1 = _mm_set1_epi16(beta);
|
||
|
flag = _mm_and_si128(flag, _mm_cmpgt_epi16(const1, diff));
|
||
|
|
||
|
diff = _mm_subs_epi16(lineb, linec); //Condtn 3
|
||
|
diff = _mm_abs_epi16(diff);
|
||
|
flag = _mm_and_si128(flag, _mm_cmpgt_epi16(const1, diff)); //Const 1= Beta from now on
|
||
|
|
||
|
flag = _mm_and_si128(flag, flag1); //Final flag (ui_B condition + other 3 conditions)
|
||
|
|
||
|
//Adding Ap<Beta and Aq<Beta
|
||
|
i_Ap = _mm_subs_epi16(linea, linec);
|
||
|
i_Ap = _mm_abs_epi16(i_Ap);
|
||
|
const2 = _mm_cmpgt_epi16(const1, i_Ap);
|
||
|
const2 = _mm_subs_epi16(zero, const2); //Make FFFF=1 and 0000=0
|
||
|
i_C = _mm_add_epi16(i_C0, const2);
|
||
|
|
||
|
i_Aq = _mm_subs_epi16(linef, lined);
|
||
|
i_Aq = _mm_abs_epi16(i_Aq);
|
||
|
const2 = _mm_cmpgt_epi16(const1, i_Aq);
|
||
|
const2 = _mm_subs_epi16(zero, const2);
|
||
|
i_C = _mm_add_epi16(i_C, const2);
|
||
|
|
||
|
//Calculate in_macro
|
||
|
diff = _mm_subs_epi16(lined, linec);
|
||
|
diff = _mm_slli_epi16(diff, 2);
|
||
|
const2 = _mm_subs_epi16(lineb, linee);
|
||
|
diff = _mm_add_epi16(diff, const2);
|
||
|
const2 = _mm_set1_epi16(4);
|
||
|
diff = _mm_add_epi16(diff, const2);
|
||
|
in_macro = _mm_srai_epi16(diff, 3);
|
||
|
|
||
|
in_macro = _mm_min_epi16(i_C, in_macro); //CLIP3
|
||
|
i_C = _mm_subs_epi16(zero, i_C);
|
||
|
in_macro = _mm_max_epi16(i_C, in_macro);
|
||
|
|
||
|
//Compute and store
|
||
|
in_macrotemp = _mm_add_epi16(linec, in_macro);
|
||
|
in_macrotemp = _mm_and_si128(in_macrotemp, flag);
|
||
|
temp = _mm_and_si128(linec,
|
||
|
_mm_xor_si128(flag, _mm_set1_epi16(0xFFFF)));
|
||
|
temp = _mm_add_epi16(temp, in_macrotemp);
|
||
|
//temp= _mm_packus_epi16 (temp, zero);
|
||
|
//_mm_storel_epi64(uc_HorzPixel+i16_posP0+i, in_macrotemp);
|
||
|
|
||
|
in_macrotemp = _mm_subs_epi16(lined, in_macro);
|
||
|
in_macrotemp = _mm_and_si128(in_macrotemp, flag);
|
||
|
temp1 = _mm_and_si128(lined,
|
||
|
_mm_xor_si128(flag, _mm_set1_epi16(0xFFFF)));
|
||
|
temp1 = _mm_add_epi16(temp1, in_macrotemp);
|
||
|
//temp1= _mm_packus_epi16 (temp1, zero);
|
||
|
//_mm_storel_epi64(pu1_src+i, in_macrotemp);
|
||
|
|
||
|
//If Ap<Beta
|
||
|
flag1 = _mm_cmpgt_epi16(const1, i_Ap);
|
||
|
flag1 = _mm_and_si128(flag, flag1);
|
||
|
in_macrotemp = _mm_add_epi16(linec, lined);
|
||
|
in_macrotemp = _mm_add_epi16(in_macrotemp, _mm_set1_epi16(1));
|
||
|
in_macrotemp = _mm_srai_epi16(in_macrotemp, 1);
|
||
|
in_macro = _mm_add_epi16(in_macrotemp, linea);
|
||
|
in_macro = _mm_subs_epi16(in_macro, _mm_slli_epi16(lineb, 1));
|
||
|
in_macro = _mm_srai_epi16(in_macro, 1);
|
||
|
|
||
|
in_macro = _mm_min_epi16(i_C0, in_macro); //CLIP3
|
||
|
i_C0 = _mm_subs_epi16(zero, i_C0);
|
||
|
in_macro = _mm_max_epi16(i_C0, in_macro);
|
||
|
|
||
|
in_macro = _mm_and_si128(in_macro, flag1);
|
||
|
lineb = _mm_add_epi16(lineb, in_macro);
|
||
|
//in_macro= _mm_packus_epi16 (i_p1, zero);
|
||
|
//_mm_storel_epi64(uc_HorzPixel+i16_posP1+i, in_macro);
|
||
|
|
||
|
flag1 = _mm_cmpgt_epi16(const1, i_Aq);
|
||
|
flag1 = _mm_and_si128(flag, flag1);
|
||
|
in_macro = _mm_add_epi16(in_macrotemp, linef);
|
||
|
in_macro = _mm_subs_epi16(in_macro, _mm_slli_epi16(linee, 1));
|
||
|
in_macro = _mm_srai_epi16(in_macro, 1);
|
||
|
|
||
|
i_C0 = _mm_abs_epi16(i_C0);
|
||
|
in_macro = _mm_min_epi16(i_C0, in_macro); //CLIP3
|
||
|
i_C0 = _mm_subs_epi16(zero, i_C0);
|
||
|
in_macro = _mm_max_epi16(i_C0, in_macro);
|
||
|
|
||
|
in_macro = _mm_and_si128(in_macro, flag1);
|
||
|
linee = _mm_add_epi16(linee, in_macro);
|
||
|
//in_macro= _mm_packus_epi16 (i_q1, zero);
|
||
|
//_mm_storel_epi64(pu1_src+i16_posQ1+i, in_macro);
|
||
|
linec = temp;
|
||
|
lined = temp1;
|
||
|
//End of filtering
|
||
|
|
||
|
int1 = _mm_unpacklo_epi16(linea, linee);
|
||
|
linee = _mm_unpackhi_epi16(linea, linee);
|
||
|
|
||
|
int2 = _mm_unpacklo_epi16(linec, lineg);
|
||
|
lineg = _mm_unpackhi_epi16(linec, lineg);
|
||
|
|
||
|
linea = _mm_unpacklo_epi16(int1, int2);
|
||
|
int3 = _mm_unpackhi_epi16(int1, int2);
|
||
|
|
||
|
linec = _mm_unpacklo_epi16(linee, lineg);
|
||
|
lineg = _mm_unpackhi_epi16(linee, lineg);
|
||
|
|
||
|
int1 = _mm_unpacklo_epi16(lineb, linef);
|
||
|
linef = _mm_unpackhi_epi16(lineb, linef);
|
||
|
|
||
|
int2 = _mm_unpacklo_epi16(lined, lineh);
|
||
|
lineh = _mm_unpackhi_epi16(lined, lineh);
|
||
|
|
||
|
lineb = _mm_unpacklo_epi16(int1, int2);
|
||
|
int4 = _mm_unpackhi_epi16(int1, int2);
|
||
|
|
||
|
lined = _mm_unpacklo_epi16(linef, lineh);
|
||
|
lineh = _mm_unpackhi_epi16(linef, lineh);
|
||
|
|
||
|
int1 = _mm_unpackhi_epi16(linea, lineb);
|
||
|
linea = _mm_unpacklo_epi16(linea, lineb);
|
||
|
|
||
|
int2 = _mm_unpacklo_epi16(int3, int4);
|
||
|
high1 = _mm_unpackhi_epi16(int3, int4);
|
||
|
|
||
|
lineb = _mm_unpacklo_epi16(linec, lined);
|
||
|
linef = _mm_unpackhi_epi16(linec, lined);
|
||
|
|
||
|
lined = _mm_unpacklo_epi16(lineg, lineh);
|
||
|
lineh = _mm_unpackhi_epi16(lineg, lineh);
|
||
|
|
||
|
linee = int1;
|
||
|
lineg = high1;
|
||
|
linec = int2;
|
||
|
//End of inverse transpose
|
||
|
|
||
|
//Packs and stores
|
||
|
linea = _mm_packus_epi16(linea, zero);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 3 + j), linea);
|
||
|
|
||
|
lineb = _mm_packus_epi16(lineb, zero);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 3 + src_strd + j), lineb);
|
||
|
|
||
|
linec = _mm_packus_epi16(linec, zero);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 3 + 2 * src_strd + j), linec);
|
||
|
|
||
|
lined = _mm_packus_epi16(lined, zero);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 3 + 3 * src_strd + j), lined);
|
||
|
|
||
|
linee = _mm_packus_epi16(linee, zero);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 3 + 4 * src_strd + j), linee);
|
||
|
|
||
|
linef = _mm_packus_epi16(linef, zero);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 3 + 5 * src_strd + j), linef);
|
||
|
|
||
|
lineg = _mm_packus_epi16(lineg, zero);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 3 + 6 * src_strd + j), lineg);
|
||
|
|
||
|
lineh = _mm_packus_epi16(lineh, zero);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 3 + 7 * src_strd + j), lineh);
|
||
|
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/*****************************************************************************/
|
||
|
/* */
|
||
|
/* Function Name : ih264_deblk_luma_horz_bslt4_ssse3() */
|
||
|
/* */
|
||
|
/* Description : This function performs filtering of a luma block */
|
||
|
/* horizontal edge when boundary strength is less than 4. */
|
||
|
/* */
|
||
|
/* Inputs : pu1_src - pointer to the src sample q0 */
|
||
|
/* src_strd - source stride */
|
||
|
/* alpha - alpha value for the boundary */
|
||
|
/* beta - beta value for the boundary */
|
||
|
/* u4_bs - packed Boundary strength array */
|
||
|
/* pu1_cliptab - tc0_table */
|
||
|
/* */
|
||
|
/* Globals : None */
|
||
|
/* */
|
||
|
/* Processing : This operation is described in Sec. 8.7.2.3 under the */
|
||
|
/* title "Filtering process for edges for bS less than 4" */
|
||
|
/* in ITU T Rec H.264. */
|
||
|
/* */
|
||
|
/* Outputs : None */
|
||
|
/* */
|
||
|
/* Returns : None */
|
||
|
/* */
|
||
|
/* Issues : None */
|
||
|
/* */
|
||
|
/* Revision History: */
|
||
|
/* */
|
||
|
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
|
||
|
/* 12 02 2015 Naveen Kumar P Initial version */
|
||
|
/* */
|
||
|
/*****************************************************************************/
|
||
|
void ih264_deblk_luma_horz_bslt4_ssse3(UWORD8 *pu1_src,
|
||
|
WORD32 src_strd,
|
||
|
WORD32 alpha,
|
||
|
WORD32 beta,
|
||
|
UWORD32 u4_bs,
|
||
|
const UWORD8 *pu1_cliptab)
|
||
|
{
|
||
|
WORD16 i16_posP2, i16_posP1, i16_posP0, i16_posQ1, i16_posQ2;
|
||
|
UWORD8 *pu1_HorzPixel;
|
||
|
__m128i zero = _mm_setzero_si128();
|
||
|
__m128i bs_flag_16x8b, C0_16x8, C0_8x16, C0_hi_8x16, C_8x16, C_hi_8x16;
|
||
|
__m128i q0_16x8, q1_16x8, q2_16x8, p0_16x8, p1_16x8, p2_16x8;
|
||
|
__m128i temp1, temp2;
|
||
|
__m128i Alpha_8x16, Beta_8x16, flag1_16x8, flag2_16x8, flag3_16x8;
|
||
|
__m128i in_macro_16x8, in_macro_hi_16x8;
|
||
|
__m128i const_val4_8x16;
|
||
|
UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
|
||
|
UWORD8 clip0, clip1, clip2, clip3;
|
||
|
|
||
|
pu1_HorzPixel = pu1_src - (src_strd << 2);
|
||
|
|
||
|
i16_posQ1 = src_strd;
|
||
|
i16_posQ2 = X2(src_strd);
|
||
|
i16_posP0 = X3(src_strd);
|
||
|
i16_posP1 = X2(src_strd);
|
||
|
i16_posP2 = src_strd;
|
||
|
|
||
|
q0_16x8 = _mm_loadu_si128((__m128i *)(pu1_src));
|
||
|
q1_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ1));
|
||
|
|
||
|
u1_Bs0 = (u4_bs >> 24) & 0xff;
|
||
|
u1_Bs1 = (u4_bs >> 16) & 0xff;
|
||
|
u1_Bs2 = (u4_bs >> 8) & 0xff;
|
||
|
u1_Bs3 = (u4_bs >> 0) & 0xff;
|
||
|
clip0 = pu1_cliptab[u1_Bs0];
|
||
|
clip1 = pu1_cliptab[u1_Bs1];
|
||
|
clip2 = pu1_cliptab[u1_Bs2];
|
||
|
clip3 = pu1_cliptab[u1_Bs3];
|
||
|
|
||
|
Alpha_8x16 = _mm_set1_epi16(alpha);
|
||
|
Beta_8x16 = _mm_set1_epi16(beta);
|
||
|
|
||
|
bs_flag_16x8b = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
|
||
|
u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
|
||
|
u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
|
||
|
|
||
|
C0_16x8 = _mm_set_epi8(clip3, clip3, clip3, clip3, clip2, clip2, clip2,
|
||
|
clip2, clip1, clip1, clip1, clip1, clip0, clip0,
|
||
|
clip0, clip0);
|
||
|
|
||
|
bs_flag_16x8b = _mm_cmpeq_epi8(bs_flag_16x8b, zero);
|
||
|
bs_flag_16x8b = _mm_xor_si128(bs_flag_16x8b, _mm_set1_epi8(0xFF)); //Invert for required mask
|
||
|
C0_8x16 = _mm_unpacklo_epi8(C0_16x8, zero);
|
||
|
C0_hi_8x16 = _mm_unpackhi_epi8(C0_16x8, zero);
|
||
|
|
||
|
p1_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP1));
|
||
|
p0_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP0));
|
||
|
p2_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP2));
|
||
|
q2_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ2));
|
||
|
|
||
|
//Cond1 (ABS(p0 - q0) < alpha)
|
||
|
temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
|
||
|
temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
|
||
|
temp2 = _mm_unpacklo_epi8(temp1, zero);
|
||
|
temp1 = _mm_unpackhi_epi8(temp1, zero);
|
||
|
|
||
|
temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
|
||
|
temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
|
||
|
|
||
|
flag1_16x8 = _mm_packs_epi16(temp2, temp1);
|
||
|
flag1_16x8 = _mm_and_si128(flag1_16x8, bs_flag_16x8b);
|
||
|
|
||
|
//Cond2 (ABS(q1 - q0) < beta)
|
||
|
temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
|
||
|
temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
|
||
|
temp2 = _mm_unpacklo_epi8(temp1, zero);
|
||
|
temp1 = _mm_unpackhi_epi8(temp1, zero);
|
||
|
|
||
|
temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
|
||
|
temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
|
||
|
|
||
|
flag2_16x8 = _mm_packs_epi16(temp2, temp1);
|
||
|
|
||
|
flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
|
||
|
|
||
|
//Cond3 (ABS(p1 - p0) < beta)
|
||
|
temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
|
||
|
temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
|
||
|
temp2 = _mm_unpacklo_epi8(temp1, zero);
|
||
|
temp1 = _mm_unpackhi_epi8(temp1, zero);
|
||
|
|
||
|
temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
|
||
|
temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
|
||
|
|
||
|
flag2_16x8 = _mm_packs_epi16(temp2, temp1);
|
||
|
|
||
|
// !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
|
||
|
flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
|
||
|
|
||
|
// (ABS(p2 - p0) < beta)
|
||
|
temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
|
||
|
temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
|
||
|
temp2 = _mm_unpacklo_epi8(temp1, zero);
|
||
|
temp1 = _mm_unpackhi_epi8(temp1, zero);
|
||
|
temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
|
||
|
temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
|
||
|
|
||
|
flag2_16x8 = _mm_packs_epi16(temp2, temp1);
|
||
|
flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
|
||
|
|
||
|
temp2 = _mm_subs_epi16(zero, temp2);
|
||
|
temp1 = _mm_subs_epi16(zero, temp1);
|
||
|
|
||
|
C_8x16 = _mm_add_epi16(C0_8x16, temp2);
|
||
|
C_hi_8x16 = _mm_add_epi16(C0_hi_8x16, temp1);
|
||
|
|
||
|
// (ABS(q2 - q0) < beta)
|
||
|
temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
|
||
|
temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
|
||
|
temp2 = _mm_unpacklo_epi8(temp1, zero);
|
||
|
temp1 = _mm_unpackhi_epi8(temp1, zero);
|
||
|
temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
|
||
|
temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
|
||
|
|
||
|
flag3_16x8 = _mm_packs_epi16(temp2, temp1);
|
||
|
flag3_16x8 = _mm_and_si128(flag1_16x8, flag3_16x8);
|
||
|
|
||
|
temp2 = _mm_subs_epi16(zero, temp2);
|
||
|
temp1 = _mm_subs_epi16(zero, temp1);
|
||
|
|
||
|
C_8x16 = _mm_add_epi16(C_8x16, temp2);
|
||
|
C_hi_8x16 = _mm_add_epi16(C_hi_8x16, temp1);
|
||
|
|
||
|
const_val4_8x16 = _mm_set1_epi16(4);
|
||
|
temp1 = _mm_subs_epi16(_mm_unpacklo_epi8(q0_16x8, zero),
|
||
|
_mm_unpacklo_epi8(p0_16x8, zero));
|
||
|
temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p1_16x8, zero),
|
||
|
_mm_unpacklo_epi8(q1_16x8, zero));
|
||
|
temp1 = _mm_slli_epi16(temp1, 2);
|
||
|
temp1 = _mm_add_epi16(temp1, temp2);
|
||
|
temp1 = _mm_add_epi16(temp1, const_val4_8x16);
|
||
|
in_macro_16x8 = _mm_srai_epi16(temp1, 3);
|
||
|
|
||
|
temp1 = _mm_subs_epi16(_mm_unpackhi_epi8(q0_16x8, zero),
|
||
|
_mm_unpackhi_epi8(p0_16x8, zero));
|
||
|
temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(p1_16x8, zero),
|
||
|
_mm_unpackhi_epi8(q1_16x8, zero));
|
||
|
temp1 = _mm_slli_epi16(temp1, 2);
|
||
|
temp1 = _mm_add_epi16(temp1, temp2);
|
||
|
temp1 = _mm_add_epi16(temp1, const_val4_8x16);
|
||
|
in_macro_hi_16x8 = _mm_srai_epi16(temp1, 3);
|
||
|
|
||
|
in_macro_16x8 = _mm_min_epi16(C_8x16, in_macro_16x8); //CLIP3
|
||
|
in_macro_hi_16x8 = _mm_min_epi16(C_hi_8x16, in_macro_hi_16x8); //CLIP3
|
||
|
C_8x16 = _mm_subs_epi16(zero, C_8x16);
|
||
|
C_hi_8x16 = _mm_subs_epi16(zero, C_hi_8x16);
|
||
|
in_macro_16x8 = _mm_max_epi16(C_8x16, in_macro_16x8); //CLIP3
|
||
|
in_macro_hi_16x8 = _mm_max_epi16(C_hi_8x16, in_macro_hi_16x8); //CLIP3
|
||
|
|
||
|
temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p0_16x8, zero), in_macro_16x8);
|
||
|
temp2 = _mm_add_epi16(_mm_unpackhi_epi8(p0_16x8, zero), in_macro_hi_16x8);
|
||
|
|
||
|
temp1 = _mm_packus_epi16(temp1, temp2);
|
||
|
|
||
|
temp1 = _mm_and_si128(temp1, flag1_16x8);
|
||
|
temp2 = _mm_and_si128(p0_16x8,
|
||
|
_mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));
|
||
|
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
|
||
|
_mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP0), temp1);
|
||
|
|
||
|
temp1 = _mm_sub_epi16(_mm_unpacklo_epi8(q0_16x8, zero), in_macro_16x8);
|
||
|
temp2 = _mm_sub_epi16(_mm_unpackhi_epi8(q0_16x8, zero), in_macro_hi_16x8);
|
||
|
|
||
|
temp1 = _mm_packus_epi16(temp1, temp2);
|
||
|
|
||
|
temp1 = _mm_and_si128(temp1, flag1_16x8);
|
||
|
temp2 = _mm_and_si128(q0_16x8,
|
||
|
_mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));
|
||
|
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
_mm_storeu_si128((__m128i *)(pu1_src), temp1);
|
||
|
|
||
|
//if(Ap < Beta)
|
||
|
temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
|
||
|
_mm_unpacklo_epi8(p0_16x8, zero));
|
||
|
temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(p1_16x8, zero), 1);
|
||
|
//temp2 = _mm_subs_epi16(zero,temp2);
|
||
|
temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p2_16x8, zero), temp2);
|
||
|
temp2 = _mm_add_epi16(temp1, temp2);
|
||
|
in_macro_16x8 = _mm_srai_epi16(temp2, 1);
|
||
|
|
||
|
temp1 = _mm_avg_epu16(_mm_unpackhi_epi8(q0_16x8, zero),
|
||
|
_mm_unpackhi_epi8(p0_16x8, zero));
|
||
|
temp2 = _mm_slli_epi16(_mm_unpackhi_epi8(p1_16x8, zero), 1);
|
||
|
//temp2 = _mm_subs_epi16(zero,temp2);
|
||
|
temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(p2_16x8, zero), temp2);
|
||
|
temp2 = _mm_add_epi16(temp1, temp2);
|
||
|
in_macro_hi_16x8 = _mm_srai_epi16(temp2, 1);
|
||
|
|
||
|
in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
|
||
|
in_macro_hi_16x8 = _mm_min_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
|
||
|
C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
|
||
|
C0_hi_8x16 = _mm_subs_epi16(zero, C0_hi_8x16);
|
||
|
in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
|
||
|
in_macro_hi_16x8 = _mm_max_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
|
||
|
|
||
|
temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p1_16x8, zero), in_macro_16x8);
|
||
|
temp2 = _mm_add_epi16(_mm_unpackhi_epi8(p1_16x8, zero), in_macro_hi_16x8);
|
||
|
|
||
|
temp1 = _mm_packus_epi16(temp1, temp2);
|
||
|
|
||
|
temp1 = _mm_and_si128(temp1, flag2_16x8);
|
||
|
temp2 = _mm_and_si128(p1_16x8,
|
||
|
_mm_xor_si128(flag2_16x8, _mm_set1_epi16(0xFFFF)));
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
_mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP1), temp1);
|
||
|
|
||
|
//if(Aq < Beta)
|
||
|
temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
|
||
|
_mm_unpacklo_epi8(p0_16x8, zero));
|
||
|
temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(q1_16x8, zero), 1);
|
||
|
//temp2 = _mm_slli_epi16 (temp2, 1);
|
||
|
temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(q2_16x8, zero), temp2);
|
||
|
temp2 = _mm_add_epi16(temp1, temp2);
|
||
|
in_macro_16x8 = _mm_srai_epi16(temp2, 1);
|
||
|
|
||
|
temp1 = _mm_avg_epu16(_mm_unpackhi_epi8(q0_16x8, zero),
|
||
|
_mm_unpackhi_epi8(p0_16x8, zero));
|
||
|
temp2 = _mm_slli_epi16(_mm_unpackhi_epi8(q1_16x8, zero), 1);
|
||
|
//temp2 = _mm_slli_epi16 (temp2, 1);
|
||
|
temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(q2_16x8, zero), temp2);
|
||
|
temp2 = _mm_add_epi16(temp1, temp2);
|
||
|
in_macro_hi_16x8 = _mm_srai_epi16(temp2, 1);
|
||
|
|
||
|
in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
|
||
|
in_macro_hi_16x8 = _mm_max_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
|
||
|
C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
|
||
|
C0_hi_8x16 = _mm_subs_epi16(zero, C0_hi_8x16);
|
||
|
in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
|
||
|
in_macro_hi_16x8 = _mm_min_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
|
||
|
|
||
|
temp1 = _mm_add_epi16(_mm_unpacklo_epi8(q1_16x8, zero), in_macro_16x8);
|
||
|
temp2 = _mm_add_epi16(_mm_unpackhi_epi8(q1_16x8, zero), in_macro_hi_16x8);
|
||
|
|
||
|
temp1 = _mm_packus_epi16(temp1, temp2);
|
||
|
|
||
|
temp1 = _mm_and_si128(temp1, flag3_16x8);
|
||
|
temp2 = _mm_and_si128(q1_16x8,
|
||
|
_mm_xor_si128(flag3_16x8, _mm_set1_epi16(0xFFFF)));
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
|
||
|
_mm_storeu_si128((__m128i *)(pu1_src + i16_posQ1), temp1);
|
||
|
|
||
|
}
|
||
|
|
||
|
/*****************************************************************************/
|
||
|
/* */
|
||
|
/* Function Name : ih264_deblk_luma_vert_bs4_mbaff_ssse3() */
|
||
|
/* */
|
||
|
/* Description : This function performs filtering of a luma block */
|
||
|
/* vertical edge when boundary strength is set to 4. */
|
||
|
/* */
|
||
|
/* Inputs : pu1_src - pointer to the src sample q0 */
|
||
|
/* src_strd - source stride */
|
||
|
/* alpha - alpha value for the boundary */
|
||
|
/* beta - beta value for the boundary */
|
||
|
/* */
|
||
|
/* Globals : None */
|
||
|
/* */
|
||
|
/* Processing : When the function is called twice, this operation is as */
|
||
|
/* described in Sec. 8.7.2.4 under the title "Filtering */
|
||
|
/* process for edges for bS equal to 4" in ITU T Rec H.264. */
|
||
|
/* */
|
||
|
/* Outputs : None */
|
||
|
/* */
|
||
|
/* Returns : None */
|
||
|
/* */
|
||
|
/* Issues : None */
|
||
|
/* */
|
||
|
/* Revision History: */
|
||
|
/* */
|
||
|
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
|
||
|
/* 12 02 2015 Naveen Kumar P Initial version */
|
||
|
/* */
|
||
|
/*****************************************************************************/
|
||
|
void ih264_deblk_luma_vert_bs4_mbaff_ssse3(UWORD8 *pu1_src,
|
||
|
WORD32 src_strd,
|
||
|
WORD32 alpha,
|
||
|
WORD32 beta)
|
||
|
{
|
||
|
__m128i zero = _mm_setzero_si128();
|
||
|
__m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
|
||
|
__m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
|
||
|
__m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
|
||
|
__m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
|
||
|
__m128i q0_16x8_1;
|
||
|
__m128i p0_16x8_1;
|
||
|
__m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
|
||
|
__m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
|
||
|
__m128i temp1, temp2, temp3, temp4, temp5, temp6;
|
||
|
__m128i Alpha_8x16, Beta_8x16;
|
||
|
__m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
|
||
|
__m128i const_val2_16x8 = _mm_set1_epi16(2);
|
||
|
__m128i line1, line2, line3, line4, line5, line6, line7, line8;
|
||
|
|
||
|
Alpha_8x16 = _mm_set1_epi16(alpha);
|
||
|
Beta_8x16 = _mm_set1_epi16(beta);
|
||
|
|
||
|
line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
|
||
|
line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
|
||
|
line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
|
||
|
line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
|
||
|
line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
|
||
|
line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
|
||
|
line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
|
||
|
line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));
|
||
|
|
||
|
temp1 = _mm_unpacklo_epi8(line1, line2);
|
||
|
temp2 = _mm_unpacklo_epi8(line3, line4);
|
||
|
temp3 = _mm_unpacklo_epi8(line5, line6);
|
||
|
temp4 = _mm_unpacklo_epi8(line7, line8);
|
||
|
|
||
|
line1 = _mm_unpacklo_epi16(temp1, temp2);
|
||
|
line2 = _mm_unpackhi_epi16(temp1, temp2);
|
||
|
line3 = _mm_unpacklo_epi16(temp3, temp4);
|
||
|
line4 = _mm_unpackhi_epi16(temp3, temp4);
|
||
|
|
||
|
p1_8x16 = _mm_unpacklo_epi32(line1, line3);
|
||
|
p0_8x16 = _mm_unpackhi_epi32(line1, line3);
|
||
|
q0_8x16 = _mm_unpacklo_epi32(line2, line4);
|
||
|
q1_8x16 = _mm_unpackhi_epi32(line2, line4);
|
||
|
|
||
|
p3_16x8 = _mm_unpacklo_epi64(p1_8x16, zero);
|
||
|
p2_16x8 = _mm_unpackhi_epi64(p1_8x16, zero);
|
||
|
q2_16x8 = _mm_unpacklo_epi64(q1_8x16, zero);
|
||
|
q3_16x8 = _mm_unpackhi_epi64(q1_8x16, zero);
|
||
|
p1_16x8 = _mm_unpacklo_epi64(p0_8x16, zero);
|
||
|
p0_16x8 = _mm_unpackhi_epi64(p0_8x16, zero);
|
||
|
q0_16x8 = _mm_unpacklo_epi64(q0_8x16, zero);
|
||
|
q1_16x8 = _mm_unpackhi_epi64(q0_8x16, zero);
|
||
|
|
||
|
//Cond1 (ABS(p0 - q0) < alpha)
|
||
|
temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
|
||
|
temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
|
||
|
temp2 = _mm_unpacklo_epi8(temp1, zero);
|
||
|
temp1 = _mm_unpackhi_epi8(temp1, zero);
|
||
|
|
||
|
temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
|
||
|
temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
|
||
|
|
||
|
flag1_16x8 = _mm_packs_epi16(temp2, temp1);
|
||
|
|
||
|
//Cond2 (ABS(q1 - q0) < beta)
|
||
|
temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
|
||
|
temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
|
||
|
temp2 = _mm_unpacklo_epi8(temp1, zero);
|
||
|
temp1 = _mm_unpackhi_epi8(temp1, zero);
|
||
|
|
||
|
temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
|
||
|
temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
|
||
|
|
||
|
flag2_16x8 = _mm_packs_epi16(temp2, temp1);
|
||
|
|
||
|
flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
|
||
|
|
||
|
//Cond3 (ABS(p1 - p0) < beta)
|
||
|
temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
|
||
|
temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
|
||
|
temp2 = _mm_unpacklo_epi8(temp1, zero);
|
||
|
temp1 = _mm_unpackhi_epi8(temp1, zero);
|
||
|
|
||
|
temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
|
||
|
temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
|
||
|
|
||
|
flag2_16x8 = _mm_packs_epi16(temp2, temp1);
|
||
|
|
||
|
// !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
|
||
|
flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
|
||
|
|
||
|
// (ABS(p0 - q0) < ((alpha >> 2) + 2))
|
||
|
temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
|
||
|
temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
|
||
|
Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);
|
||
|
|
||
|
temp2 = _mm_unpacklo_epi8(temp1, zero);
|
||
|
temp1 = _mm_unpackhi_epi8(temp1, zero);
|
||
|
temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
|
||
|
temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
|
||
|
|
||
|
flag2_16x8 = _mm_packs_epi16(temp2, temp1);
|
||
|
flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
|
||
|
|
||
|
// (ABS(p2 - p0) < beta)
|
||
|
temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
|
||
|
temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
|
||
|
temp2 = _mm_unpacklo_epi8(temp1, zero);
|
||
|
temp1 = _mm_unpackhi_epi8(temp1, zero);
|
||
|
temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
|
||
|
temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
|
||
|
|
||
|
flag3_16x8 = _mm_packs_epi16(temp2, temp1);
|
||
|
flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);
|
||
|
|
||
|
// (ABS(q2 - q0) < beta)
|
||
|
temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
|
||
|
temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
|
||
|
temp2 = _mm_unpacklo_epi8(temp1, zero);
|
||
|
temp1 = _mm_unpackhi_epi8(temp1, zero);
|
||
|
temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
|
||
|
temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
|
||
|
|
||
|
flag4_16x8 = _mm_packs_epi16(temp2, temp1);
|
||
|
flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);
|
||
|
|
||
|
// First 8 pixels
|
||
|
p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
|
||
|
p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
|
||
|
p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
|
||
|
p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
|
||
|
q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
|
||
|
q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
|
||
|
q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
|
||
|
q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);
|
||
|
|
||
|
// p0_1 and q0_1
|
||
|
temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
|
||
|
temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
|
||
|
temp5 = _mm_add_epi16(temp1, const_val2_16x8);
|
||
|
temp6 = _mm_add_epi16(temp2, const_val2_16x8);
|
||
|
temp3 = _mm_slli_epi16(p1_8x16, 1);
|
||
|
temp4 = _mm_slli_epi16(q1_8x16, 1);
|
||
|
temp1 = _mm_add_epi16(temp5, temp3);
|
||
|
temp2 = _mm_add_epi16(temp6, temp4);
|
||
|
p0_16x8_1 = _mm_srai_epi16(temp1, 2);
|
||
|
q0_16x8_1 = _mm_srai_epi16(temp2, 2);
|
||
|
|
||
|
// p1_2 and q1_2
|
||
|
temp6 = _mm_add_epi16(temp6, p0_8x16);
|
||
|
temp5 = _mm_add_epi16(temp5, q0_8x16);
|
||
|
temp1 = _mm_add_epi16(temp6, p2_8x16);
|
||
|
temp2 = _mm_add_epi16(temp5, q2_8x16);
|
||
|
p1_16x8_2 = _mm_srai_epi16(temp1, 2);
|
||
|
q1_16x8_2 = _mm_srai_epi16(temp2, 2);
|
||
|
|
||
|
// p0_2 and q0_2
|
||
|
temp1 = _mm_add_epi16(temp3, p2_8x16);
|
||
|
temp2 = _mm_add_epi16(temp4, q2_8x16);
|
||
|
temp1 = _mm_add_epi16(temp1, q1_8x16);
|
||
|
temp2 = _mm_add_epi16(temp2, p1_8x16);
|
||
|
temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
|
||
|
temp3 = _mm_slli_epi16(temp3, 1);
|
||
|
temp1 = _mm_add_epi16(temp1, temp3);
|
||
|
temp2 = _mm_add_epi16(temp2, temp3);
|
||
|
temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
|
||
|
temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
|
||
|
p0_16x8_2 = _mm_srai_epi16(temp1, 3);
|
||
|
q0_16x8_2 = _mm_srai_epi16(temp2, 3);
|
||
|
|
||
|
// p2_2 and q2_2
|
||
|
temp1 = _mm_add_epi16(temp6, const_val2_16x8);
|
||
|
temp2 = _mm_add_epi16(temp5, const_val2_16x8);
|
||
|
temp3 = _mm_slli_epi16(p2_8x16, 1);
|
||
|
temp4 = _mm_slli_epi16(q2_8x16, 1);
|
||
|
temp3 = _mm_add_epi16(p2_8x16, temp3);
|
||
|
temp4 = _mm_add_epi16(q2_8x16, temp4);
|
||
|
temp5 = _mm_slli_epi16(p3_8x16, 1);
|
||
|
temp6 = _mm_slli_epi16(q3_8x16, 1);
|
||
|
temp1 = _mm_add_epi16(temp1, temp3);
|
||
|
temp2 = _mm_add_epi16(temp2, temp4);
|
||
|
temp1 = _mm_add_epi16(temp1, temp5);
|
||
|
temp2 = _mm_add_epi16(temp2, temp6);
|
||
|
p2_16x8_2 = _mm_srai_epi16(temp1, 3);
|
||
|
q2_16x8_2 = _mm_srai_epi16(temp2, 3);
|
||
|
|
||
|
// p0_1 and q0_1
|
||
|
p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, zero);
|
||
|
q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, zero);
|
||
|
|
||
|
// p1_2 and q1_2
|
||
|
p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, zero);
|
||
|
q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, zero);
|
||
|
|
||
|
// p0_2 and q0_2
|
||
|
p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, zero);
|
||
|
q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, zero);
|
||
|
|
||
|
// p2_2 and q2_2
|
||
|
p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, zero);
|
||
|
q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, zero);
|
||
|
|
||
|
// p0 and q0
|
||
|
p0_16x8 = _mm_and_si128(p0_16x8,
|
||
|
_mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
|
||
|
p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
|
||
|
p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
|
||
|
q0_16x8 = _mm_and_si128(q0_16x8,
|
||
|
_mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
|
||
|
q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
|
||
|
q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);
|
||
|
|
||
|
// p0 and q0
|
||
|
p0_16x8 = _mm_and_si128(p0_16x8,
|
||
|
_mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
|
||
|
p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
|
||
|
p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
|
||
|
q0_16x8 = _mm_and_si128(q0_16x8,
|
||
|
_mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
|
||
|
q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
|
||
|
q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);
|
||
|
|
||
|
// p1 and q1
|
||
|
p1_16x8 = _mm_and_si128(p1_16x8,
|
||
|
_mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
|
||
|
p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
|
||
|
p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
|
||
|
q1_16x8 = _mm_and_si128(q1_16x8,
|
||
|
_mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
|
||
|
q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
|
||
|
q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);
|
||
|
|
||
|
// p2 and q2
|
||
|
p2_16x8 = _mm_and_si128(p2_16x8,
|
||
|
_mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
|
||
|
p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
|
||
|
p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
|
||
|
q2_16x8 = _mm_and_si128(q2_16x8,
|
||
|
_mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
|
||
|
q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
|
||
|
q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);
|
||
|
|
||
|
temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
|
||
|
temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8);
|
||
|
temp3 = _mm_unpacklo_epi8(q0_16x8, q1_16x8);
|
||
|
temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);
|
||
|
|
||
|
p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
|
||
|
p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
|
||
|
q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
|
||
|
q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);
|
||
|
|
||
|
line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
|
||
|
line2 = _mm_srli_si128(line1, 8);
|
||
|
line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
|
||
|
line4 = _mm_srli_si128(line3, 8);
|
||
|
line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
|
||
|
line6 = _mm_srli_si128(line5, 8);
|
||
|
line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
|
||
|
line8 = _mm_srli_si128(line7, 8);
|
||
|
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);
|
||
|
|
||
|
}
|
||
|
|
||
|
/*****************************************************************************/
|
||
|
/* */
|
||
|
/* Function Name : ih264_deblk_luma_vert_bslt4_mbaff_ssse3() */
|
||
|
/* */
|
||
|
/* Description : This function performs filtering of a luma block */
|
||
|
/* vertical edge when boundary strength is less than 4. */
|
||
|
/* */
|
||
|
/* Inputs : pu1_src - pointer to the src sample q0 */
|
||
|
/* src_strd - source stride */
|
||
|
/* alpha - alpha value for the boundary */
|
||
|
/* beta - beta value for the boundary */
|
||
|
/* u4_bs - packed Boundary strength array */
|
||
|
/* pu1_cliptab - tc0_table */
|
||
|
/* */
|
||
|
/* Globals : None */
|
||
|
/* */
|
||
|
/* Processing : When the function is called twice, this operation is as */
|
||
|
/* described in Sec. 8.7.2.3 under the title "Filtering */
|
||
|
/* process for edges for bS less than 4" in ITU T Rec H.264.*/
|
||
|
/* */
|
||
|
/* Outputs : None */
|
||
|
/* */
|
||
|
/* Returns : None */
|
||
|
/* */
|
||
|
/* Issues : None */
|
||
|
/* */
|
||
|
/* Revision History: */
|
||
|
/* */
|
||
|
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
|
||
|
/* 12 02 2015 Naveen Kumar P Initial version */
|
||
|
/* */
|
||
|
/*****************************************************************************/
|
||
|
void ih264_deblk_luma_vert_bslt4_mbaff_ssse3(UWORD8 *pu1_src,
|
||
|
WORD32 src_strd,
|
||
|
WORD32 alpha,
|
||
|
WORD32 beta,
|
||
|
UWORD32 u4_bs,
|
||
|
const UWORD8 *pu1_cliptab)
|
||
|
{
|
||
|
__m128i zero = _mm_setzero_si128();
|
||
|
__m128i bs_flag_16x8b, C0_16x8, C0_8x16, C_8x16;
|
||
|
__m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
|
||
|
__m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
|
||
|
__m128i temp1, temp2, temp3, temp4;
|
||
|
__m128i Alpha_8x16, Beta_8x16, flag1_16x8, flag2_16x8, flag3_16x8;
|
||
|
__m128i in_macro_16x8;
|
||
|
__m128i const_val4_8x16;
|
||
|
UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
|
||
|
UWORD8 clip0, clip1, clip2, clip3;
|
||
|
__m128i line1, line2, line3, line4, line5, line6, line7, line8;
|
||
|
__m128i q0_16x8_1, q1_16x8_1, q0_16x8_2;
|
||
|
__m128i p0_16x8_1, p1_16x8_1, p0_16x8_2;
|
||
|
|
||
|
line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
|
||
|
line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
|
||
|
line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
|
||
|
line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
|
||
|
line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
|
||
|
line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
|
||
|
line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
|
||
|
line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));
|
||
|
|
||
|
temp1 = _mm_unpacklo_epi8(line1, line2);
|
||
|
temp2 = _mm_unpacklo_epi8(line3, line4);
|
||
|
temp3 = _mm_unpacklo_epi8(line5, line6);
|
||
|
temp4 = _mm_unpacklo_epi8(line7, line8);
|
||
|
|
||
|
line1 = _mm_unpacklo_epi16(temp1, temp2);
|
||
|
line2 = _mm_unpackhi_epi16(temp1, temp2);
|
||
|
line3 = _mm_unpacklo_epi16(temp3, temp4);
|
||
|
line4 = _mm_unpackhi_epi16(temp3, temp4);
|
||
|
|
||
|
temp1 = _mm_unpacklo_epi32(line1, line3);
|
||
|
temp2 = _mm_unpackhi_epi32(line1, line3);
|
||
|
temp3 = _mm_unpacklo_epi32(line2, line4);
|
||
|
temp4 = _mm_unpackhi_epi32(line2, line4);
|
||
|
|
||
|
p3_16x8 = _mm_unpacklo_epi64(temp1, zero);
|
||
|
p2_16x8 = _mm_unpackhi_epi64(temp1, zero);
|
||
|
q2_16x8 = _mm_unpacklo_epi64(temp4, zero);
|
||
|
q3_16x8 = _mm_unpackhi_epi64(temp4, zero);
|
||
|
p1_16x8 = _mm_unpacklo_epi64(temp2, zero);
|
||
|
p0_16x8 = _mm_unpackhi_epi64(temp2, zero);
|
||
|
q0_16x8 = _mm_unpacklo_epi64(temp3, zero);
|
||
|
q1_16x8 = _mm_unpackhi_epi64(temp3, zero);
|
||
|
|
||
|
u1_Bs0 = (u4_bs >> 24) & 0xff;
|
||
|
u1_Bs1 = (u4_bs >> 16) & 0xff;
|
||
|
u1_Bs2 = (u4_bs >> 8) & 0xff;
|
||
|
u1_Bs3 = (u4_bs >> 0) & 0xff;
|
||
|
clip0 = pu1_cliptab[u1_Bs0];
|
||
|
clip1 = pu1_cliptab[u1_Bs1];
|
||
|
clip2 = pu1_cliptab[u1_Bs2];
|
||
|
clip3 = pu1_cliptab[u1_Bs3];
|
||
|
|
||
|
Alpha_8x16 = _mm_set1_epi16(alpha);
|
||
|
Beta_8x16 = _mm_set1_epi16(beta);
|
||
|
|
||
|
bs_flag_16x8b = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, u1_Bs3, u1_Bs3, u1_Bs2,
|
||
|
u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs0, u1_Bs0);
|
||
|
|
||
|
C0_16x8 = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, clip3, clip3, clip2, clip2,
|
||
|
clip1, clip1, clip0, clip0);
|
||
|
|
||
|
bs_flag_16x8b = _mm_cmpeq_epi8(bs_flag_16x8b, zero);
|
||
|
bs_flag_16x8b = _mm_xor_si128(bs_flag_16x8b, _mm_set1_epi8(0xFF)); //Invert for required mask
|
||
|
C0_8x16 = _mm_unpacklo_epi8(C0_16x8, zero);
|
||
|
|
||
|
//Cond1 (ABS(p0 - q0) < alpha)
|
||
|
temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
|
||
|
temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
|
||
|
temp2 = _mm_unpacklo_epi8(temp1, zero);
|
||
|
temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
|
||
|
|
||
|
flag1_16x8 = _mm_packs_epi16(temp2, zero);
|
||
|
flag1_16x8 = _mm_and_si128(flag1_16x8, bs_flag_16x8b);
|
||
|
|
||
|
//Cond2 (ABS(q1 - q0) < beta)
|
||
|
temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
|
||
|
temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
|
||
|
temp2 = _mm_unpacklo_epi8(temp1, zero);
|
||
|
temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
|
||
|
|
||
|
flag2_16x8 = _mm_packs_epi16(temp2, zero);
|
||
|
flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
|
||
|
|
||
|
//Cond3 (ABS(p1 - p0) < beta)
|
||
|
temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
|
||
|
temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
|
||
|
temp2 = _mm_unpacklo_epi8(temp1, zero);
|
||
|
temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
|
||
|
|
||
|
flag2_16x8 = _mm_packs_epi16(temp2, zero);
|
||
|
|
||
|
// !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
|
||
|
flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
|
||
|
|
||
|
// (ABS(p2 - p0) < beta)
|
||
|
temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
|
||
|
temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
|
||
|
temp2 = _mm_unpacklo_epi8(temp1, zero);
|
||
|
temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
|
||
|
|
||
|
flag2_16x8 = _mm_packs_epi16(temp2, zero);
|
||
|
flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
|
||
|
|
||
|
temp2 = _mm_subs_epi16(zero, temp2);
|
||
|
|
||
|
C_8x16 = _mm_add_epi16(C0_8x16, temp2);
|
||
|
|
||
|
// (ABS(q2 - q0) < beta)
|
||
|
temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
|
||
|
temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
|
||
|
temp1 = _mm_add_epi8(temp1, temp2);
|
||
|
|
||
|
temp2 = _mm_unpacklo_epi8(temp1, zero);
|
||
|
temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
|
||
|
|
||
|
flag3_16x8 = _mm_packs_epi16(temp2, zero);
|
||
|
flag3_16x8 = _mm_and_si128(flag1_16x8, flag3_16x8);
|
||
|
|
||
|
temp2 = _mm_subs_epi16(zero, temp2);
|
||
|
|
||
|
C_8x16 = _mm_add_epi16(C_8x16, temp2);
|
||
|
|
||
|
const_val4_8x16 = _mm_set1_epi16(4);
|
||
|
temp1 = _mm_subs_epi16(_mm_unpacklo_epi8(q0_16x8, zero),
|
||
|
_mm_unpacklo_epi8(p0_16x8, zero));
|
||
|
temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p1_16x8, zero),
|
||
|
_mm_unpacklo_epi8(q1_16x8, zero));
|
||
|
temp1 = _mm_slli_epi16(temp1, 2);
|
||
|
temp1 = _mm_add_epi16(temp1, temp2);
|
||
|
temp1 = _mm_add_epi16(temp1, const_val4_8x16);
|
||
|
in_macro_16x8 = _mm_srai_epi16(temp1, 3);
|
||
|
|
||
|
in_macro_16x8 = _mm_min_epi16(C_8x16, in_macro_16x8); //CLIP3
|
||
|
C_8x16 = _mm_subs_epi16(zero, C_8x16);
|
||
|
in_macro_16x8 = _mm_max_epi16(C_8x16, in_macro_16x8); //CLIP3
|
||
|
|
||
|
// p0
|
||
|
temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p0_16x8, zero), in_macro_16x8);
|
||
|
|
||
|
temp1 = _mm_packus_epi16(temp1, zero);
|
||
|
|
||
|
p0_16x8_1 = _mm_and_si128(temp1, flag1_16x8);
|
||
|
p0_16x8_2 = _mm_and_si128(
|
||
|
p0_16x8, _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));
|
||
|
|
||
|
p0_16x8_1 = _mm_add_epi8(p0_16x8_1, p0_16x8_2);
|
||
|
|
||
|
// q0
|
||
|
temp1 = _mm_sub_epi16(_mm_unpacklo_epi8(q0_16x8, zero), in_macro_16x8);
|
||
|
|
||
|
temp1 = _mm_packus_epi16(temp1, zero);
|
||
|
|
||
|
q0_16x8_1 = _mm_and_si128(temp1, flag1_16x8);
|
||
|
q0_16x8_2 = _mm_and_si128(
|
||
|
q0_16x8, _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));
|
||
|
|
||
|
q0_16x8_1 = _mm_add_epi8(q0_16x8_1, q0_16x8_2);
|
||
|
|
||
|
//if(Ap < Beta)
|
||
|
temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
|
||
|
_mm_unpacklo_epi8(p0_16x8, zero));
|
||
|
temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(p1_16x8, zero), 1);
|
||
|
//temp2 = _mm_subs_epi16(zero,temp2);
|
||
|
temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p2_16x8, zero), temp2);
|
||
|
temp2 = _mm_add_epi16(temp1, temp2);
|
||
|
in_macro_16x8 = _mm_srai_epi16(temp2, 1);
|
||
|
|
||
|
in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
|
||
|
C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
|
||
|
in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
|
||
|
|
||
|
// p1
|
||
|
temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p1_16x8, zero), in_macro_16x8);
|
||
|
|
||
|
temp1 = _mm_packus_epi16(temp1, zero);
|
||
|
|
||
|
p1_16x8_1 = _mm_and_si128(temp1, flag2_16x8);
|
||
|
p1_16x8 = _mm_and_si128(p1_16x8,
|
||
|
_mm_xor_si128(flag2_16x8, _mm_set1_epi16(0xFFFF)));
|
||
|
p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_1);
|
||
|
|
||
|
//if(Aq < Beta)
|
||
|
temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
|
||
|
_mm_unpacklo_epi8(p0_16x8, zero));
|
||
|
temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(q1_16x8, zero), 1);
|
||
|
//temp2 = _mm_slli_epi16 (temp2, 1);
|
||
|
temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(q2_16x8, zero), temp2);
|
||
|
temp2 = _mm_add_epi16(temp1, temp2);
|
||
|
in_macro_16x8 = _mm_srai_epi16(temp2, 1);
|
||
|
|
||
|
in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
|
||
|
C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
|
||
|
in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
|
||
|
|
||
|
temp1 = _mm_add_epi16(_mm_unpacklo_epi8(q1_16x8, zero), in_macro_16x8);
|
||
|
|
||
|
// q1
|
||
|
temp1 = _mm_packus_epi16(temp1, zero);
|
||
|
|
||
|
q1_16x8_1 = _mm_and_si128(temp1, flag3_16x8);
|
||
|
q1_16x8 = _mm_and_si128(q1_16x8,
|
||
|
_mm_xor_si128(flag3_16x8, _mm_set1_epi16(0xFFFF)));
|
||
|
q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_1);
|
||
|
|
||
|
temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
|
||
|
temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8_1);
|
||
|
temp3 = _mm_unpacklo_epi8(q0_16x8_1, q1_16x8);
|
||
|
temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);
|
||
|
|
||
|
line7 = _mm_unpacklo_epi16(temp1, temp2);
|
||
|
temp1 = _mm_unpackhi_epi16(temp1, temp2);
|
||
|
line8 = _mm_unpacklo_epi16(temp3, temp4);
|
||
|
temp2 = _mm_unpackhi_epi16(temp3, temp4);
|
||
|
|
||
|
line1 = _mm_unpacklo_epi32(line7, line8);
|
||
|
line2 = _mm_srli_si128(line1, 8);
|
||
|
line3 = _mm_unpackhi_epi32(line7, line8);
|
||
|
line4 = _mm_srli_si128(line3, 8);
|
||
|
line5 = _mm_unpacklo_epi32(temp1, temp2);
|
||
|
line6 = _mm_srli_si128(line5, 8);
|
||
|
line7 = _mm_unpackhi_epi32(temp1, temp2);
|
||
|
line8 = _mm_srli_si128(line7, 8);
|
||
|
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
|
||
|
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);
|
||
|
}
|
||
|
|