From cdd7a7269a871857019a2db289f25c8b85507f49 Mon Sep 17 00:00:00 2001
From: jinbo
Date: Thu, 5 Dec 2024 17:28:45 +0800
Subject: [PATCH 1/2] Fix typo in src/GUI/PageInput.h

---
 src/GUI/PageInput.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/GUI/PageInput.h b/src/GUI/PageInput.h
index e4789549..107bbc59 100644
--- a/src/GUI/PageInput.h
+++ b/src/GUI/PageInput.h
@@ -301,7 +301,7 @@ private slots:
 	inline unsigned int GetVideoV4L2Width() { return m_spinbox_video_v4l2_width->value(); }
 	inline unsigned int GetVideoV4L2Height() { return m_spinbox_video_v4l2_height->value(); }
 #endif
-#if SSR_USE_V4L2
+#if SSR_USE_PIPEWIRE
 	inline QString GetVideoPipeWireSource() { return m_lineedit_video_pipewire_source->text(); }
 	inline unsigned int GetVideoPipeWireWidth() { return m_spinbox_video_pipewire_width->value(); }
 	inline unsigned int GetVideoPipeWireHeight() { return m_spinbox_video_pipewire_height->value(); }

From 7d8a377c1e04c414b2b61364a858abe1a6d32ed9 Mon Sep 17 00:00:00 2001
From: jinbo
Date: Tue, 10 Dec 2024 15:13:34 +0800
Subject: [PATCH 2/2] Use simde to add LSX acceleration for loongarch

Add SIMD acceleration for the scaler, the converters and the FIR filter by
using simde to translate SSE intrinsics into LSX intrinsics (LSX is the
128-bit SIMD instruction set of the LoongArch platform).

simde introduction: https://github.com/simd-everywhere/simde
---
 CMakeLists.txt                         |   5 +
 cmake/FindSIMDE.cmake                  |  13 +
 src/AV/FastResampler.cpp               |  11 +-
 src/AV/FastResampler_FirFilter.h       |   6 +
 src/AV/FastResampler_FirFilter_LSX.cpp |  90 ++++++
 src/AV/FastScaler.cpp                  |  48 +++
 src/AV/FastScaler_Convert.h            |   8 +
 src/AV/FastScaler_Convert_LSX.cpp      | 428 +++++++++++++++++++++++++
 src/AV/FastScaler_Scale.h              |   4 +
 src/AV/FastScaler_Scale_LSX.cpp        | 342 ++++++++++++++++++++
 src/Benchmark.cpp                      |  46 ++-
 src/CMakeLists.txt                     |  22 ++
 src/Main.cpp                           |   2 +-
 src/common/CPUFeatures.cpp             |  25 ++
 src/common/CPUFeatures.h               |  17 +
 15 files changed, 1059 insertions(+), 8 deletions(-)
 create mode 100644 cmake/FindSIMDE.cmake
 create mode 100644 src/AV/FastResampler_FirFilter_LSX.cpp
 create mode 100644 src/AV/FastScaler_Convert_LSX.cpp
 create mode 100644 src/AV/FastScaler_Scale_LSX.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 94f8ccd7..f9059983 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,12 +11,16 @@ project(simplescreenrecorder VERSION 0.4.4)
 
 if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|i386|i686")
 	set(PROCESSOR_IS_X86 TRUE)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch")
+	set(PROCESSOR_IS_LOONGARCH TRUE)
 else()
 	set(PROCESSOR_IS_X86 FALSE)
+	set(PROCESSOR_IS_LOONGARCH FALSE)
 endif()
 
 option(ENABLE_32BIT_GLINJECT "Build the 32-bit version of 'libssr-glinject' on 64-bit systems (in addition to the 64-bit version). Required for OpenGL recording of 32-bit applications on 64-bit systems." FALSE)
 option(ENABLE_X86_ASM "Allow x86/x64 assembly or intrinsics." ${PROCESSOR_IS_X86})
+option(ENABLE_LOONGARCH_ASM "Allow loongarch assembly or intrinsics." ${PROCESSOR_IS_LOONGARCH})
 option(ENABLE_FFMPEG_VERSIONS "Use FFmpeg version numbers for feature support tests. Enable when using FFmpeg, disable when using Libav." TRUE)
 option(ENABLE_JACK_METADATA "Use the JACK metadata API. May not work with very old JACK versions." TRUE)
 option(WITH_OPENGL_RECORDING "Build with OpenGL recording support." TRUE)
@@ -28,6 +32,7 @@
 option(WITH_JACK "Build with JACK support." TRUE)
 option(WITH_QT5 "Build with Qt5 (instead of Qt4)." FALSE)
 option(WITH_SIMPLESCREENRECORDER "Build the 'simplescreenrecorder' executable."
TRUE) option(WITH_GLINJECT "Build the 'libssr-glinject' library. Required for OpenGL recording." TRUE) +option(WITH_SIMDE "Build with simde support." ${PROCESSOR_IS_LOONGARCH}) set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake) diff --git a/cmake/FindSIMDE.cmake b/cmake/FindSIMDE.cmake new file mode 100644 index 00000000..927a2dfd --- /dev/null +++ b/cmake/FindSIMDE.cmake @@ -0,0 +1,13 @@ +# rules for finding the SIMDE library + +find_package(PkgConfig REQUIRED) +pkg_check_modules(PC_SIMDE simde) + +find_path(SIMDE_INCLUDE_DIR simde/x86/sse2.h simde/x86/ssse3.h HINTS ${PC_SIMDE_INCLUDEDIR} ${PC_SIMDE_INCLUDE_DIRS}) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(SIMDE DEFAULT_MSG SIMDE_INCLUDE_DIR) + +mark_as_advanced(SIMDE_INCLUDE_DIR) + +set(SIMDE_INCLUDE_DIRS ${SIMDE_INCLUDE_DIR}) diff --git a/src/AV/FastResampler.cpp b/src/AV/FastResampler.cpp index 81a62362..08bb4d0c 100644 --- a/src/AV/FastResampler.cpp +++ b/src/AV/FastResampler.cpp @@ -105,13 +105,22 @@ FastResampler::FastResampler(unsigned int channels, float gain) { default: m_firfilter2_ptr = &FastResampler_FirFilter2_Cn_SSE2; break; } } else { +#endif +#if SSR_USE_LOONGARCH_ASM + if(CPUFeatures::HasLSX()) { + switch(m_channels) { + case 1: m_firfilter2_ptr = &FastResampler_FirFilter2_C1_LSX; break; + case 2: m_firfilter2_ptr = &FastResampler_FirFilter2_C2_LSX; break; + default: m_firfilter2_ptr = &FastResampler_FirFilter2_Cn_LSX; break; + } + } else { #endif switch(m_channels) { case 1: m_firfilter2_ptr = &FastResampler_FirFilter2_C1_Fallback; break; case 2: m_firfilter2_ptr = &FastResampler_FirFilter2_C2_Fallback; break; default: m_firfilter2_ptr = &FastResampler_FirFilter2_Cn_Fallback; break; } -#if SSR_USE_X86_ASM +#if SSR_USE_X86_ASM || SSR_USE_LOONGARCH_ASM } #endif diff --git a/src/AV/FastResampler_FirFilter.h b/src/AV/FastResampler_FirFilter.h index 3ab02ea5..90baf9ce 100644 --- a/src/AV/FastResampler_FirFilter.h +++ b/src/AV/FastResampler_FirFilter.h @@ -31,3 +31,9 @@ void FastResampler_FirFilter2_C1_SSE2(unsigned int channels, unsigned int filter void FastResampler_FirFilter2_C2_SSE2(unsigned int channels, unsigned int filter_length, float* coef1, float* coef2, float frac, float* input, float* output); void FastResampler_FirFilter2_Cn_SSE2(unsigned int channels, unsigned int filter_length, float* coef1, float* coef2, float frac, float* input, float* output); #endif + +#if SSR_USE_LOONGARCH_ASM +void FastResampler_FirFilter2_C1_LSX(unsigned int channels, unsigned int filter_length, float* coef1, float* coef2, float frac, float* input, float* output); +void FastResampler_FirFilter2_C2_LSX(unsigned int channels, unsigned int filter_length, float* coef1, float* coef2, float frac, float* input, float* output); +void FastResampler_FirFilter2_Cn_LSX(unsigned int channels, unsigned int filter_length, float* coef1, float* coef2, float frac, float* input, float* output); +#endif diff --git a/src/AV/FastResampler_FirFilter_LSX.cpp b/src/AV/FastResampler_FirFilter_LSX.cpp new file mode 100644 index 00000000..79e9f9c1 --- /dev/null +++ b/src/AV/FastResampler_FirFilter_LSX.cpp @@ -0,0 +1,90 @@ +/* +Copyright (c) 2012-2024 Maarten Baert + +This file is part of SimpleScreenRecorder. + +SimpleScreenRecorder is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. 
+
+SimpleScreenRecorder is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with SimpleScreenRecorder. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "FastResampler_FirFilter.h"
+
+#if SSR_USE_LOONGARCH_ASM
+
+#ifndef SIMDE_ENABLE_NATIVE_ALIASES
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#endif
+//using simde to translate SSE2 to LSX(loongarch 128-bit simd)
+#include <simde/x86/sse2.h>
+
+//void FastResampler_FirFilter2_C1_SSE2(...)
+void FastResampler_FirFilter2_C1_LSX(unsigned int channels, unsigned int filter_length, float* coef1, float* coef2, float frac, float* input, float* output) {
+	Q_UNUSED(channels);
+	__m128 sum = _mm_setzero_ps();
+	__m128 v_frac = _mm_set1_ps(frac);
+	for(unsigned int i = 0; i < filter_length / 4; ++i) {
+		__m128 v_coef1 = _mm_load_ps(coef1), v_coef2 = _mm_load_ps(coef2);
+		coef1 += 4; coef2 += 4;
+		__m128 filter_value = _mm_add_ps(v_coef1, _mm_mul_ps(_mm_sub_ps(v_coef2, v_coef1), v_frac));
+		__m128 v_input = _mm_loadu_ps(input);
+		input += 4;
+		sum = _mm_add_ps(sum, _mm_mul_ps(v_input, filter_value));
+	}
+	__m128 sum2 = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, 0x0e));
+	__m128 sum3 = _mm_add_ss(sum2, _mm_shuffle_ps(sum2, sum2, 0x01));
+	_mm_store_ss(output, sum3);
+}
+
+//void FastResampler_FirFilter2_C2_SSE2(...)
+void FastResampler_FirFilter2_C2_LSX(unsigned int channels, unsigned int filter_length, float* coef1, float* coef2, float frac, float* input, float* output) {
+	Q_UNUSED(channels);
+	__m128 sum = _mm_setzero_ps();
+	__m128 v_frac = _mm_set1_ps(frac);
+	for(unsigned int i = 0; i < filter_length / 4; ++i) {
+		__m128 v_coef1 = _mm_load_ps(coef1), v_coef2 = _mm_load_ps(coef2);
+		coef1 += 4; coef2 += 4;
+		__m128 filter_value = _mm_add_ps(v_coef1, _mm_mul_ps(_mm_sub_ps(v_coef2, v_coef1), v_frac));
+		__m128 v_input1 = _mm_loadu_ps(input), v_input2 = _mm_loadu_ps(input + 4);
+		input += 8;
+		sum = _mm_add_ps(sum, _mm_mul_ps(v_input1, _mm_unpacklo_ps(filter_value, filter_value)));
+		sum = _mm_add_ps(sum, _mm_mul_ps(v_input2, _mm_unpackhi_ps(filter_value, filter_value)));
+	}
+	__m128 sum2 = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, 0xee));
+	_mm_store_sd((double*) output, _mm_castps_pd(sum2));
+}
+
+//void FastResampler_FirFilter2_Cn_SSE2(...)
+void FastResampler_FirFilter2_Cn_LSX(unsigned int channels, unsigned int filter_length, float* coef1, float* coef2, float frac, float* input, float* output) { + Q_UNUSED(channels); + for(unsigned int c = 0; c < channels; ++c) { + __m128 sum = _mm_setzero_ps(); + __m128 v_frac = _mm_set1_ps(frac); + float *input2 = input + c; + for(unsigned int i = 0; i < filter_length / 4; ++i) { + __m128 v_coef1 = _mm_load_ps(coef1), v_coef2 = _mm_load_ps(coef2); + coef1 += 4; coef2 += 4; + __m128 filter_value = _mm_add_ps(v_coef1, _mm_mul_ps(_mm_sub_ps(v_coef2, v_coef1), v_frac)); + __m128 v_input1 = _mm_load_ss(input2); input2 += channels; + __m128 v_input2 = _mm_load_ss(input2); input2 += channels; + __m128 v_input3 = _mm_load_ss(input2); input2 += channels; + __m128 v_input4 = _mm_load_ss(input2); input2 += channels; + __m128 v_input = _mm_movelh_ps(_mm_unpacklo_ps(v_input1, v_input2), _mm_unpacklo_ps(v_input3, v_input4)); + sum = _mm_add_ps(sum, _mm_mul_ps(v_input, filter_value)); + } + __m128 sum2 = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, 0x0e)); + __m128 sum3 = _mm_add_ss(sum2, _mm_shuffle_ps(sum2, sum2, 0x01)); + _mm_store_ss(output + c, sum3); + } +} + +#endif diff --git a/src/AV/FastScaler.cpp b/src/AV/FastScaler.cpp index 3ef7ecb2..462b48c2 100644 --- a/src/AV/FastScaler.cpp +++ b/src/AV/FastScaler.cpp @@ -165,6 +165,14 @@ void FastScaler::Convert_BGRA_YUV444(unsigned int width, unsigned int height, co return; } #endif +#if SSR_USE_LOONGARCH_ASM + if(CPUFeatures::HasLSX()) { + Convert_BGRA_YUV444_LSX(width, height, in_data, in_stride, out_data, out_stride); + } else { + Convert_BGRA_YUV444_Fallback(width, height, in_data, in_stride, out_data, out_stride); + } + return; +#endif Convert_BGRA_YUV444_Fallback(width, height, in_data, in_stride, out_data, out_stride); @@ -190,6 +198,14 @@ void FastScaler::Convert_BGRA_YUV422(unsigned int width, unsigned int height, co return; } #endif +#if SSR_USE_LOONGARCH_ASM + if(CPUFeatures::HasLSX()) { + Convert_BGRA_YUV422_LSX(width, height, in_data, in_stride, out_data, out_stride); + } else { + Convert_BGRA_YUV422_Fallback(width, height, in_data, in_stride, out_data, out_stride); + } + return; +#endif Convert_BGRA_YUV422_Fallback(width, height, in_data, in_stride, out_data, out_stride); @@ -215,6 +231,14 @@ void FastScaler::Convert_BGRA_YUV420(unsigned int width, unsigned int height, co return; } #endif +#if SSR_USE_LOONGARCH_ASM + if(CPUFeatures::HasLSX()) { + Convert_BGRA_YUV420_LSX(width, height, in_data, in_stride, out_data, out_stride); + } else { + Convert_BGRA_YUV420_Fallback(width, height, in_data, in_stride, out_data, out_stride); + } + return; +#endif Convert_BGRA_YUV420_Fallback(width, height, in_data, in_stride, out_data, out_stride); @@ -239,6 +263,14 @@ void FastScaler::Convert_BGRA_NV12(unsigned int width, unsigned int height, cons return; } #endif +#if SSR_USE_LOONGARCH_ASM + if(CPUFeatures::HasLSX()) { + Convert_BGRA_NV12_LSX(width, height, in_data, in_stride, out_data, out_stride); + } else { + Convert_BGRA_NV12_Fallback(width, height, in_data, in_stride, out_data, out_stride); + } + return; +#endif Convert_BGRA_NV12_Fallback(width, height, in_data, in_stride, out_data, out_stride); @@ -261,6 +293,14 @@ void FastScaler::Convert_BGRA_BGR(unsigned int width, unsigned int height, const return; } #endif +#if SSR_USE_LOONGARCH_ASM + if(CPUFeatures::HasLSX()) { + Convert_BGRA_BGR_LSX(width, height, in_data, in_stride, out_data, out_stride); + } else { + Convert_BGRA_BGR_Fallback(width, height, in_data, in_stride, out_data, out_stride); + } + 
return;
+#endif
 
 	Convert_BGRA_BGR_Fallback(width, height, in_data, in_stride, out_data, out_stride);
 
@@ -284,6 +324,14 @@ void FastScaler::Scale_BGRA(unsigned int in_width, unsigned int in_height, const
 		return;
 	}
 #endif
+#if SSR_USE_LOONGARCH_ASM
+	if(CPUFeatures::HasLSX()) {
+		Scale_BGRA_LSX(in_width, in_height, in_data, in_stride, out_width, out_height, out_data, out_stride);
+	} else {
+		Scale_BGRA_Fallback(in_width, in_height, in_data, in_stride, out_width, out_height, out_data, out_stride);
+	}
+	return;
+#endif
 
 	Scale_BGRA_Fallback(in_width, in_height, in_data, in_stride, out_width, out_height, out_data, out_stride);
 
diff --git a/src/AV/FastScaler_Convert.h b/src/AV/FastScaler_Convert.h
index 91dec7fb..6d10c4d1 100644
--- a/src/AV/FastScaler_Convert.h
+++ b/src/AV/FastScaler_Convert.h
@@ -33,3 +33,11 @@ void Convert_BGRA_YUV420_SSSE3(unsigned int w, unsigned int h, const uint8_t* in
 void Convert_BGRA_NV12_SSSE3(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[2], const int out_stride[2]);
 void Convert_BGRA_BGR_SSSE3(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* out_data, int out_stride);
 #endif
+
+#if SSR_USE_LOONGARCH_ASM
+void Convert_BGRA_YUV444_LSX(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]);
+void Convert_BGRA_YUV422_LSX(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]);
+void Convert_BGRA_YUV420_LSX(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]);
+void Convert_BGRA_NV12_LSX(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[2], const int out_stride[2]);
+void Convert_BGRA_BGR_LSX(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* out_data, int out_stride);
+#endif
diff --git a/src/AV/FastScaler_Convert_LSX.cpp b/src/AV/FastScaler_Convert_LSX.cpp
new file mode 100644
index 00000000..cf9ffa57
--- /dev/null
+++ b/src/AV/FastScaler_Convert_LSX.cpp
@@ -0,0 +1,428 @@
+/*
+Copyright (c) 2012-2024 Maarten Baert
+
+This file is part of SimpleScreenRecorder.
+
+SimpleScreenRecorder is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+SimpleScreenRecorder is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with SimpleScreenRecorder. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "FastScaler_Convert.h"
+
+#if SSR_USE_LOONGARCH_ASM
+
+#ifndef SIMDE_ENABLE_NATIVE_ALIASES
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#endif
+//using simde to translate SSSE3 to LSX(loongarch 128-bit simd)
+#include <simde/x86/ssse3.h>
+/*
+==== SSSE3 BGRA-to-YUV444/YUV420 Converter ====
+
+Uses the same principle as the fallback converter, but uses 16-bit integers so it can do 8 operations at once.
+- YUV444: takes blocks of 16x1 pixels, produces 16x1 Y/U/V values +- YUV422: takes blocks of 16x1 pixels, produces 16x1 Y and 8x1 U/V values +- YUV420: takes blocks of 16x2 pixels, produces 16x2 Y and 8x1 U/V values + +The code uses interleaving to reduce the number of shuffles. So for example the order for red is [ r0 r4 r1 r5 r2 r6 r3 r7 ]. +For the averaging of 2x2 blocks, it uses 32-bit horizontal addition instead of 16-bit because of this interleaving. +The order of the final result is [ sr0 sr2 sr1 sr3 sr4 sr6 sr5 sr7 ]. + +If the width is not a multiple of 8/16, the remainder (right edge of the image) is converted without SSSE3. + +This converter is about 4 times faster than the fallback converter. +*/ + +#define ReadBGRAInterleaved(ptr1, ptr2, ca, cb, r, g, b) \ + __m128i ca = _mm_loadu_si128((__m128i*) (ptr1)), cb = _mm_loadu_si128((__m128i*) (ptr2)); \ + __m128i r = _mm_or_si128(_mm_and_si128(_mm_srli_si128(ca, 2), v_byte1), _mm_and_si128( cb , v_byte3)); \ + __m128i g = _mm_or_si128(_mm_and_si128(_mm_srli_si128(ca, 1), v_byte1), _mm_and_si128(_mm_slli_si128(cb, 1), v_byte3)); \ + __m128i b = _mm_or_si128(_mm_and_si128( ca , v_byte1), _mm_and_si128(_mm_slli_si128(cb, 2), v_byte3)); +#define Convert_RGB_Y(r, g, b, y) \ + __m128i y = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(r, v_mat_yr), _mm_mullo_epi16(g, v_mat_yg)), _mm_add_epi16(_mm_mullo_epi16(b, v_mat_yb), v_offset_y)); +#define Convert_RGB_U(r, g, b, u) \ + __m128i u = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(r, v_mat_ur), _mm_mullo_epi16(g, v_mat_ug)), _mm_add_epi16(_mm_mullo_epi16(b, v_mat_ub_vr), v_offset_uv)); +#define Convert_RGB_V(r, g, b, v) \ + __m128i v = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(r, v_mat_ub_vr), _mm_mullo_epi16(g, v_mat_vg)), _mm_add_epi16(_mm_mullo_epi16(b, v_mat_vb), v_offset_uv)); +#define WritePlaneInterleaved(ptr, y1, y2, sh1, sh2) \ + _mm_stream_si128((__m128i*) (ptr), _mm_or_si128(_mm_shuffle_epi8(y1, sh1), _mm_shuffle_epi8(y2, sh2))); + +//void Convert_BGRA_YUV444_SSSE3(...) 
+void Convert_BGRA_YUV444_LSX(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]) { + assert((uintptr_t) out_data[0] % 16 == 0 && out_stride[0] % 16 == 0); + assert((uintptr_t) out_data[1] % 16 == 0 && out_stride[1] % 16 == 0); + assert((uintptr_t) out_data[2] % 16 == 0 && out_stride[2] % 16 == 0); + + __m128i v_byte1 = _mm_set1_epi32(0x000000ff); + __m128i v_byte3 = _mm_set1_epi32(0x00ff0000); + __m128i v_mat_yr = _mm_set1_epi16(47); + __m128i v_mat_yg = _mm_set1_epi16(157); + __m128i v_mat_yb = _mm_set1_epi16(16); + __m128i v_mat_ur = _mm_set1_epi16(-26); + __m128i v_mat_ug = _mm_set1_epi16(-86); + __m128i v_mat_ub_vr = _mm_set1_epi16(112); + __m128i v_mat_vg = _mm_set1_epi16(-102); + __m128i v_mat_vb = _mm_set1_epi16(-10); + __m128i v_offset_y = _mm_set1_epi16((int16_t) (128 + (16 << 8))); + __m128i v_offset_uv = _mm_set1_epi16((int16_t) (128 + (128 << 8))); + __m128i v_shuffle1 = _mm_setr_epi8(1, 5, 9, 13, 3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1); + __m128i v_shuffle2 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 1, 5, 9, 13, 3, 7, 11, 15); + + const int offset_y = 128 + (16 << 8), offset_uv = 128 + (128 << 8); + + for(unsigned int j = 0; j < h; ++j) { + const uint32_t *rgb = (const uint32_t*) (in_data + in_stride * (int) j); + uint8_t *yuv_y = out_data[0] + out_stride[0] * (int) j; + uint8_t *yuv_u = out_data[1] + out_stride[1] * (int) j; + uint8_t *yuv_v = out_data[2] + out_stride[2] * (int) j; + for(unsigned int i = 0; i < w / 16; ++i) { + ReadBGRAInterleaved(rgb , rgb + 4, ca1, cb1, r1, g1, b1); + ReadBGRAInterleaved(rgb + 8, rgb + 12, ca2, cb2, r2, g2, b2); + _mm_prefetch(rgb + 48, _MM_HINT_T0); + rgb += 16; + Convert_RGB_Y(r1, g1, b1, y1); + Convert_RGB_Y(r2, g2, b2, y2); + WritePlaneInterleaved(yuv_y, y1, y2, v_shuffle1, v_shuffle2); + yuv_y += 16; + Convert_RGB_U(r1, g1, b1, u1); + Convert_RGB_U(r2, g2, b2, u2); + WritePlaneInterleaved(yuv_u, u1, u2, v_shuffle1, v_shuffle2); + yuv_u += 16; + Convert_RGB_V(r1, g1, b1, v1); + Convert_RGB_V(r2, g2, b2, v2); + WritePlaneInterleaved(yuv_v, v1, v2, v_shuffle1, v_shuffle2); + yuv_v += 16; + } + for(unsigned int i = 0; i < (w & 15); ++i) { + uint32_t c = *(rgb++); + int r = (int) ((c >> 16) & 0xff); + int g = (int) ((c >> 8) & 0xff); + int b = (int) ((c ) & 0xff); + *(yuv_y++) = ( 47 * r + 157 * g + 16 * b + offset_y) >> 8; + *(yuv_u++) = (-26 * r + -86 * g + 112 * b + offset_uv) >> 8; + *(yuv_v++) = (112 * r + -102 * g + -10 * b + offset_uv) >> 8; + } + } + + _mm_sfence(); + +} + +//void Convert_BGRA_YUV422_SSSE3(...) 
+void Convert_BGRA_YUV422_LSX(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]) { + assert(w % 2 == 0); + assert((uintptr_t) out_data[0] % 16 == 0 && out_stride[0] % 16 == 0); + assert((uintptr_t) out_data[1] % 16 == 0 && out_stride[1] % 16 == 0); + assert((uintptr_t) out_data[2] % 16 == 0 && out_stride[2] % 16 == 0); + + __m128i v_byte1 = _mm_set1_epi32(0x000000ff); + __m128i v_byte3 = _mm_set1_epi32(0x00ff0000); + __m128i v_mat_yr = _mm_set1_epi16(47); + __m128i v_mat_yg = _mm_set1_epi16(157); + __m128i v_mat_yb = _mm_set1_epi16(16); + __m128i v_mat_ur = _mm_set1_epi16(-26); + __m128i v_mat_ug = _mm_set1_epi16(-86); + __m128i v_mat_ub_vr = _mm_set1_epi16(112); + __m128i v_mat_vg = _mm_set1_epi16(-102); + __m128i v_mat_vb = _mm_set1_epi16(-10); + __m128i v_offset_y = _mm_set1_epi16((int16_t) (128 + (16 << 8))); + __m128i v_offset_uv = _mm_set1_epi16((int16_t) (128 + (128 << 8))); + __m128i v_shuffle1 = _mm_setr_epi8(1, 5, 9, 13, 3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1); + __m128i v_shuffle2 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 1, 5, 9, 13, 3, 7, 11, 15); + __m128i v_shuffle3 = _mm_setr_epi8(1, 5, 3, 7, 9, 13, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1); + + const int offset_y = 128 + (16 << 8), offset_uv = (128 + (128 << 8)) << 1; + + for(unsigned int j = 0; j < h; ++j) { + const uint32_t *rgb = (const uint32_t*) (in_data + in_stride * (int) j); + uint8_t *yuv_y = out_data[0] + out_stride[0] * (int) j; + uint8_t *yuv_u = out_data[1] + out_stride[1] * (int) j; + uint8_t *yuv_v = out_data[2] + out_stride[2] * (int) j; + for(unsigned int i = 0; i < w / 16; ++i) { + ReadBGRAInterleaved(rgb , rgb + 4, ca1, cb1, r1, g1, b1); + ReadBGRAInterleaved(rgb + 8, rgb + 12, ca2, cb2, r2, g2, b2); + _mm_prefetch(rgb + 48, _MM_HINT_T0); + rgb += 16; + Convert_RGB_Y(r1, g1, b1, y1); + Convert_RGB_Y(r2, g2, b2, y2); + WritePlaneInterleaved(yuv_y, y1, y2, v_shuffle1, v_shuffle2); + yuv_y += 16; + __m128i ra = _mm_srli_epi16(_mm_hadd_epi32(r1, r2), 1); + __m128i ga = _mm_srli_epi16(_mm_hadd_epi32(g1, g2), 1); + __m128i ba = _mm_srli_epi16(_mm_hadd_epi32(b1, b2), 1); + __m128i u = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(ra, v_mat_ur), _mm_mullo_epi16(ga, v_mat_ug)), _mm_add_epi16(_mm_mullo_epi16(ba, v_mat_ub_vr), v_offset_uv)); + _mm_storel_epi64((__m128i*) yuv_u, _mm_shuffle_epi8(u, v_shuffle3)); + yuv_u += 8; + __m128i v = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(ra, v_mat_ub_vr), _mm_mullo_epi16(ga, v_mat_vg)), _mm_add_epi16(_mm_mullo_epi16(ba, v_mat_vb), v_offset_uv)); + _mm_storel_epi64((__m128i*) yuv_v, _mm_shuffle_epi8(v, v_shuffle3)); + yuv_v += 8; + } + for(unsigned int i = 0; i < (w & 15) / 2; ++i) { + uint32_t c1 = rgb[0], c2 = rgb[1]; + rgb += 2; + int r1 = (int) ((c1 >> 16) & 0xff), r2 = (int) ((c2 >> 16) & 0xff); + int g1 = (int) ((c1 >> 8) & 0xff), g2 = (int) ((c2 >> 8) & 0xff); + int b1 = (int) ((c1 ) & 0xff), b2 = (int) ((c2 ) & 0xff); + yuv_y[0] = (47 * r1 + 157 * g1 + 16 * b1 + offset_y) >> 8; + yuv_y[1] = (47 * r2 + 157 * g2 + 16 * b2 + offset_y) >> 8; + yuv_y += 2; + int sr = r1 + r2; + int sg = g1 + g2; + int sb = b1 + b2; + *(yuv_u++) = (-26 * sr + -86 * sg + 112 * sb + offset_uv) >> 9; + *(yuv_v++) = (112 * sr + -102 * sg + -10 * sb + offset_uv) >> 9; + } + } + + _mm_sfence(); + +} + +//void Convert_BGRA_YUV420_SSSE3(...) 
+void Convert_BGRA_YUV420_LSX(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]) { + assert(w % 2 == 0 && h % 2 == 0); + assert((uintptr_t) out_data[0] % 16 == 0 && out_stride[0] % 16 == 0); + assert((uintptr_t) out_data[1] % 16 == 0 && out_stride[1] % 16 == 0); + assert((uintptr_t) out_data[2] % 16 == 0 && out_stride[2] % 16 == 0); + + __m128i v_byte1 = _mm_set1_epi32(0x000000ff); + __m128i v_byte3 = _mm_set1_epi32(0x00ff0000); + __m128i v_mat_yr = _mm_set1_epi16(47); + __m128i v_mat_yg = _mm_set1_epi16(157); + __m128i v_mat_yb = _mm_set1_epi16(16); + __m128i v_mat_ur = _mm_set1_epi16(-26); + __m128i v_mat_ug = _mm_set1_epi16(-86); + __m128i v_mat_ub_vr = _mm_set1_epi16(112); + __m128i v_mat_vg = _mm_set1_epi16(-102); + __m128i v_mat_vb = _mm_set1_epi16(-10); + __m128i v_offset_y = _mm_set1_epi16((int16_t) (128 + (16 << 8))); + __m128i v_offset_uv = _mm_set1_epi16((int16_t) (128 + (128 << 8))); + __m128i v_2 = _mm_set1_epi16(2); + __m128i v_shuffle1 = _mm_setr_epi8(1, 5, 9, 13, 3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1); + __m128i v_shuffle2 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 1, 5, 9, 13, 3, 7, 11, 15); + __m128i v_shuffle3 = _mm_setr_epi8(1, 5, 3, 7, 9, 13, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1); + + const int offset_y = 128 + (16 << 8), offset_uv = (128 + (128 << 8)) << 2; + + for(unsigned int j = 0; j < h / 2; ++j) { + const uint32_t *rgb1 = (const uint32_t*) (in_data + in_stride * (int) (j * 2)); + const uint32_t *rgb2 = (const uint32_t*) (in_data + in_stride * (int) (j * 2 + 1)); + uint8_t *yuv_y1 = out_data[0] + out_stride[0] * (int) (j * 2); + uint8_t *yuv_y2 = out_data[0] + out_stride[0] * (int) (j * 2 + 1); + uint8_t *yuv_u = out_data[1] + out_stride[1] * (int) j; + uint8_t *yuv_v = out_data[2] + out_stride[2] * (int) j; + for(unsigned int i = 0; i < w / 16; ++i) { + __m128i ra, ga, ba; + { + ReadBGRAInterleaved(rgb1 , rgb1 + 4, ca1, cb1, r1, g1, b1); + ReadBGRAInterleaved(rgb1 + 8, rgb1 + 12, ca2, cb2, r2, g2, b2); + rgb1 += 16; + Convert_RGB_Y(r1, g1, b1, y1); + Convert_RGB_Y(r2, g2, b2, y2); + WritePlaneInterleaved(yuv_y1, y1, y2, v_shuffle1, v_shuffle2); + yuv_y1 += 16; + _mm_prefetch(rgb1 + 16, _MM_HINT_T0); + ra = _mm_hadd_epi32(r1, r2); + ga = _mm_hadd_epi32(g1, g2); + ba = _mm_hadd_epi32(b1, b2); + } + { + ReadBGRAInterleaved(rgb2 , rgb2 + 4, ca1, cb1, r1, g1, b1); + ReadBGRAInterleaved(rgb2 + 8, rgb2 + 12, ca2, cb2, r2, g2, b2); + rgb2 += 16; + Convert_RGB_Y(r1, g1, b1, y1); + Convert_RGB_Y(r2, g2, b2, y2); + WritePlaneInterleaved(yuv_y2, y1, y2, v_shuffle1, v_shuffle2); + yuv_y2 += 16; + _mm_prefetch(rgb2 + 16, _MM_HINT_T0); + ra = _mm_add_epi16(ra, _mm_hadd_epi32(r1, r2)); + ga = _mm_add_epi16(ga, _mm_hadd_epi32(g1, g2)); + ba = _mm_add_epi16(ba, _mm_hadd_epi32(b1, b2)); + } + { + ra = _mm_srli_epi16(_mm_add_epi16(ra, v_2), 2); + ga = _mm_srli_epi16(_mm_add_epi16(ga, v_2), 2); + ba = _mm_srli_epi16(_mm_add_epi16(ba, v_2), 2); + __m128i u = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(ra, v_mat_ur), _mm_mullo_epi16(ga, v_mat_ug)), _mm_add_epi16(_mm_mullo_epi16(ba, v_mat_ub_vr), v_offset_uv)); + _mm_storel_epi64((__m128i*) yuv_u, _mm_shuffle_epi8(u, v_shuffle3)); + yuv_u += 8; + __m128i v = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(ra, v_mat_ub_vr), _mm_mullo_epi16(ga, v_mat_vg)), _mm_add_epi16(_mm_mullo_epi16(ba, v_mat_vb), v_offset_uv)); + _mm_storel_epi64((__m128i*) yuv_v, _mm_shuffle_epi8(v, v_shuffle3)); + yuv_v += 8; + } + } + for(unsigned int i = 0; i < (w & 15) / 2; ++i) { 
+ uint32_t c1 = rgb1[0], c2 = rgb1[1], c3 = rgb2[0], c4 = rgb2[1]; + rgb1 += 2; rgb2 += 2; + int r1 = (int) ((c1 >> 16) & 0xff), r2 = (int) ((c2 >> 16) & 0xff), r3 = (int) ((c3 >> 16) & 0xff), r4 = (int) ((c4 >> 16) & 0xff); + int g1 = (int) ((c1 >> 8) & 0xff), g2 = (int) ((c2 >> 8) & 0xff), g3 = (int) ((c3 >> 8) & 0xff), g4 = (int) ((c4 >> 8) & 0xff); + int b1 = (int) ((c1 ) & 0xff), b2 = (int) ((c2 ) & 0xff), b3 = (int) ((c3 ) & 0xff), b4 = (int) ((c4 ) & 0xff); + yuv_y1[0] = (47 * r1 + 157 * g1 + 16 * b1 + offset_y) >> 8; + yuv_y1[1] = (47 * r2 + 157 * g2 + 16 * b2 + offset_y) >> 8; + yuv_y2[0] = (47 * r3 + 157 * g3 + 16 * b3 + offset_y) >> 8; + yuv_y2[1] = (47 * r4 + 157 * g4 + 16 * b4 + offset_y) >> 8; + yuv_y1 += 2; yuv_y2 += 2; + int sr = r1 + r2 + r3 + r4; + int sg = g1 + g2 + g3 + g4; + int sb = b1 + b2 + b3 + b4; + *(yuv_u++) = (-26 * sr + -86 * sg + 112 * sb + offset_uv) >> 10; + *(yuv_v++) = (112 * sr + -102 * sg + -10 * sb + offset_uv) >> 10; + } + } + + _mm_sfence(); + +} + +//void Convert_BGRA_NV12_SSSE3(...) +void Convert_BGRA_NV12_LSX(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[2], const int out_stride[2]) { + assert(w % 2 == 0 && h % 2 == 0); + assert((uintptr_t) out_data[0] % 16 == 0 && out_stride[0] % 16 == 0); + assert((uintptr_t) out_data[1] % 16 == 0 && out_stride[1] % 16 == 0); + + __m128i v_byte1 = _mm_set1_epi32(0x000000ff); + __m128i v_byte3 = _mm_set1_epi32(0x00ff0000); + __m128i v_mat_yr = _mm_set1_epi16(47); + __m128i v_mat_yg = _mm_set1_epi16(157); + __m128i v_mat_yb = _mm_set1_epi16(16); + __m128i v_mat_ur = _mm_set1_epi16(-26); + __m128i v_mat_ug = _mm_set1_epi16(-86); + __m128i v_mat_ub_vr = _mm_set1_epi16(112); + __m128i v_mat_vg = _mm_set1_epi16(-102); + __m128i v_mat_vb = _mm_set1_epi16(-10); + __m128i v_offset_y = _mm_set1_epi16((int16_t) (128 + (16 << 8))); + __m128i v_offset_uv = _mm_set1_epi16((int16_t) (128 + (128 << 8))); + __m128i v_2 = _mm_set1_epi16(2); + __m128i v_shuffle1 = _mm_setr_epi8( 1, 5, 9, 13, 3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1); + __m128i v_shuffle2 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 1, 5, 9, 13, 3, 7, 11, 15); + __m128i v_shuffle3 = _mm_setr_epi8( 1, -1, 5, -1, 3, -1, 7, -1, 9, -1, 13, -1, 11, -1, 15, -1); + __m128i v_shuffle4 = _mm_setr_epi8(-1, 1, -1, 5, -1, 3, -1, 7, -1, 9, -1, 13, -1, 11, -1, 15); + + const int offset_y = 128 + (16 << 8), offset_uv = (128 + (128 << 8)) << 2; + + for(unsigned int j = 0; j < h / 2; ++j) { + const uint32_t *rgb1 = (const uint32_t*) (in_data + in_stride * (int) (j * 2)); + const uint32_t *rgb2 = (const uint32_t*) (in_data + in_stride * (int) (j * 2 + 1)); + uint8_t *yuv_y1 = out_data[0] + out_stride[0] * (int) (j * 2); + uint8_t *yuv_y2 = out_data[0] + out_stride[0] * (int) (j * 2 + 1); + uint8_t *yuv_uv = out_data[1] + out_stride[1] * (int) j; + for(unsigned int i = 0; i < w / 16; ++i) { + __m128i ra, ga, ba; + { + ReadBGRAInterleaved(rgb1 , rgb1 + 4, ca1, cb1, r1, g1, b1); + ReadBGRAInterleaved(rgb1 + 8, rgb1 + 12, ca2, cb2, r2, g2, b2); + rgb1 += 16; + Convert_RGB_Y(r1, g1, b1, y1); + Convert_RGB_Y(r2, g2, b2, y2); + WritePlaneInterleaved(yuv_y1, y1, y2, v_shuffle1, v_shuffle2); + yuv_y1 += 16; + _mm_prefetch(rgb1 + 16, _MM_HINT_T0); + ra = _mm_hadd_epi32(r1, r2); + ga = _mm_hadd_epi32(g1, g2); + ba = _mm_hadd_epi32(b1, b2); + } + { + ReadBGRAInterleaved(rgb2 , rgb2 + 4, ca1, cb1, r1, g1, b1); + ReadBGRAInterleaved(rgb2 + 8, rgb2 + 12, ca2, cb2, r2, g2, b2); + rgb2 += 16; + Convert_RGB_Y(r1, g1, b1, y1); + Convert_RGB_Y(r2, 
g2, b2, y2); + WritePlaneInterleaved(yuv_y2, y1, y2, v_shuffle1, v_shuffle2); + yuv_y2 += 16; + _mm_prefetch(rgb2 + 16, _MM_HINT_T0); + ra = _mm_add_epi16(ra, _mm_hadd_epi32(r1, r2)); + ga = _mm_add_epi16(ga, _mm_hadd_epi32(g1, g2)); + ba = _mm_add_epi16(ba, _mm_hadd_epi32(b1, b2)); + } + { + ra = _mm_srli_epi16(_mm_add_epi16(ra, v_2), 2); + ga = _mm_srli_epi16(_mm_add_epi16(ga, v_2), 2); + ba = _mm_srli_epi16(_mm_add_epi16(ba, v_2), 2); + __m128i u = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(ra, v_mat_ur), _mm_mullo_epi16(ga, v_mat_ug)), _mm_add_epi16(_mm_mullo_epi16(ba, v_mat_ub_vr), v_offset_uv)); + __m128i v = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(ra, v_mat_ub_vr), _mm_mullo_epi16(ga, v_mat_vg)), _mm_add_epi16(_mm_mullo_epi16(ba, v_mat_vb), v_offset_uv)); + WritePlaneInterleaved(yuv_uv, u, v, v_shuffle3, v_shuffle4); + yuv_uv += 16; + } + } + for(unsigned int i = 0; i < (w & 15) / 2; ++i) { + uint32_t c1 = rgb1[0], c2 = rgb1[1], c3 = rgb2[0], c4 = rgb2[1]; + rgb1 += 2; rgb2 += 2; + int r1 = (int) ((c1 >> 16) & 0xff), r2 = (int) ((c2 >> 16) & 0xff), r3 = (int) ((c3 >> 16) & 0xff), r4 = (int) ((c4 >> 16) & 0xff); + int g1 = (int) ((c1 >> 8) & 0xff), g2 = (int) ((c2 >> 8) & 0xff), g3 = (int) ((c3 >> 8) & 0xff), g4 = (int) ((c4 >> 8) & 0xff); + int b1 = (int) ((c1 ) & 0xff), b2 = (int) ((c2 ) & 0xff), b3 = (int) ((c3 ) & 0xff), b4 = (int) ((c4 ) & 0xff); + yuv_y1[0] = (47 * r1 + 157 * g1 + 16 * b1 + offset_y) >> 8; + yuv_y1[1] = (47 * r2 + 157 * g2 + 16 * b2 + offset_y) >> 8; + yuv_y2[0] = (47 * r3 + 157 * g3 + 16 * b3 + offset_y) >> 8; + yuv_y2[1] = (47 * r4 + 157 * g4 + 16 * b4 + offset_y) >> 8; + yuv_y1 += 2; yuv_y2 += 2; + int sr = r1 + r2 + r3 + r4; + int sg = g1 + g2 + g3 + g4; + int sb = b1 + b2 + b3 + b4; + yuv_uv[0] = (-26 * sr + -86 * sg + 112 * sb + offset_uv) >> 10; + yuv_uv[1] = (112 * sr + -102 * sg + -10 * sb + offset_uv) >> 10; + yuv_uv += 2; + } + } + + _mm_sfence(); + +} + +/* +==== SSSE3 BGRA-to-BGR Converter ==== + +Same as the fallback converter, but with a larger block size and shuffles instead of shifts and bitwise or. +- BGR: converts blocks of 16x1 pixels +*/ + +//void Convert_BGRA_BGR_SSSE3(...) 
+void Convert_BGRA_BGR_LSX(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* out_data, int out_stride) {
+	assert((uintptr_t) out_data % 16 == 0 && out_stride % 16 == 0);
+
+	__m128i v_shuffle1 = _mm_setr_epi8( 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1);
+	__m128i v_shuffle2 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 4);
+	__m128i v_shuffle3 = _mm_setr_epi8( 5, 6, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1, -1);
+	__m128i v_shuffle4 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 4, 5, 6, 8, 9);
+	__m128i v_shuffle5 = _mm_setr_epi8(10, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+	__m128i v_shuffle6 = _mm_setr_epi8(-1, -1, -1, -1, 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14);
+
+	for(unsigned int j = 0; j < h; ++j) {
+		const uint8_t *in = in_data + in_stride * (int) j;
+		uint8_t *out = out_data + out_stride * (int) j;
+		for(unsigned int i = 0; i < w / 16; ++i) {
+			__m128i c0 = _mm_loadu_si128((__m128i*) (in     ));
+			__m128i c1 = _mm_loadu_si128((__m128i*) (in + 16));
+			__m128i c2 = _mm_loadu_si128((__m128i*) (in + 32));
+			__m128i c3 = _mm_loadu_si128((__m128i*) (in + 48));
+			//_mm_prefetch(in + 192, _MM_HINT_T0);
+			in += 64;
+			_mm_stream_si128((__m128i*) (out     ), _mm_or_si128(_mm_shuffle_epi8(c0, v_shuffle1), _mm_shuffle_epi8(c1, v_shuffle2)));
+			_mm_stream_si128((__m128i*) (out + 16), _mm_or_si128(_mm_shuffle_epi8(c1, v_shuffle3), _mm_shuffle_epi8(c2, v_shuffle4)));
+			_mm_stream_si128((__m128i*) (out + 32), _mm_or_si128(_mm_shuffle_epi8(c2, v_shuffle5), _mm_shuffle_epi8(c3, v_shuffle6)));
+			out += 48;
+		}
+		for(unsigned int i = 0; i < (w & 15); ++i) {
+			uint32_t c = *((uint32_t*) in);
+			in += 4;
+			out[0] = c;
+			out[1] = c >> 8;
+			out[2] = c >> 16;
+			out += 3;
+		}
+	}
+
+	_mm_sfence();
+
+}
+
+#endif
diff --git a/src/AV/FastScaler_Scale.h b/src/AV/FastScaler_Scale.h
index c6cc3787..f6dac95e 100644
--- a/src/AV/FastScaler_Scale.h
+++ b/src/AV/FastScaler_Scale.h
@@ -27,3 +27,7 @@ void Scale_BGRA_Fallback(unsigned int in_w, unsigned int in_h, const uint8_t* in
 void Scale_BGRA_SSSE3(unsigned int in_w, unsigned int in_h, const uint8_t* in_data, int in_stride, unsigned int out_w, unsigned int out_h, uint8_t* out_data, int out_stride);
 #endif
+
+#if SSR_USE_LOONGARCH_ASM
+void Scale_BGRA_LSX(unsigned int in_w, unsigned int in_h, const uint8_t* in_data, int in_stride,
+                    unsigned int out_w, unsigned int out_h, uint8_t* out_data, int out_stride);
+#endif
diff --git a/src/AV/FastScaler_Scale_LSX.cpp b/src/AV/FastScaler_Scale_LSX.cpp
new file mode 100644
index 00000000..07a2a801
--- /dev/null
+++ b/src/AV/FastScaler_Scale_LSX.cpp
@@ -0,0 +1,342 @@
+/*
+Copyright (c) 2012-2024 Maarten Baert
+
+This file is part of SimpleScreenRecorder.
+
+SimpleScreenRecorder is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+SimpleScreenRecorder is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with SimpleScreenRecorder. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "FastScaler_Scale.h"
+
+#include "FastScaler_Scale_Generic.h"
+#include "TempBuffer.h"
+
+#if SSR_USE_LOONGARCH_ASM
+
+#ifndef SIMDE_ENABLE_NATIVE_ALIASES
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#endif
+//using simde to translate SSSE3 to LSX(loongarch 128-bit simd)
+#include <simde/x86/ssse3.h>
+
+/*
+==== SSSE3 MipMapper ====
+
+Very similar to the fallback mipmapper. There are three different SSSE3 kernels depending on the horizontal mipmap factor (mx).
+The principle is the same as with 'wannabe-SIMD', but here we want to use larger reads/writes so horizontal addition is used.
+This complicates the loops a lot and this is the reason why there are three different kernels: the first one has no horizontal addition,
+the second one has one horizontal addition, and the third one has three horizontal additions. The horizontal additions are slower and not associative,
+so they are avoided as much as possible by delaying them until the end.
+
+The remainders (edges of the image that require special attention) don't use SSSE3 because it's not worth it.
+
+You won't see huge improvements compared to the fallback mipmapper, since both algorithms are usually limited by the memory bandwidth.
+
+It's important that this function is force-inlined because this allows the compiler to eliminate the inner loops for common mipmap factors.
+*/
+
+//void MipMap_BGRA_SSSE3_Dynamic(...)
+inline __attribute__((always_inline))
+void MipMap_BGRA_LSX_Dynamic(unsigned int in_w, unsigned int in_h, const uint8_t* in_data, int in_stride,
+                             uint8_t* out_data, int out_stride, unsigned int mx, unsigned int my) {
+	assert((uintptr_t) out_data % 16 == 0 && out_stride % 16 == 0);
+	__m128i v_mask = _mm_set1_epi16(0xff);
+	__m128i v_offset = _mm_set1_epi16(1u << (mx + my - 1));
+	const uint64_t mask = vec4x16(0xff);
+	const uint64_t offset = vec4x16(1u << (mx + my - 1));
+	unsigned int wrem = in_w & ((1u << mx) - 1);
+	unsigned int hrem = in_h & ((1u << my) - 1);
+	for(unsigned int out_j = 0; out_j < (in_h >> my); ++out_j) {
+		const uint32_t *in = (const uint32_t*) (in_data + in_stride * (int) (out_j << my));
+		uint32_t *out = (uint32_t*) (out_data + out_stride * (int) out_j);
+		unsigned int blockrem;
+		if(mx == 0) {
+			for(unsigned int out_i = 0; out_i < (in_w >> (mx + 2)); ++out_i) {
+				__m128i sum1br = _mm_setzero_si128(), sum1ga = _mm_setzero_si128();
+				const uint32_t *in2 = in;
+				for(unsigned int mj = 0; mj < (1u << my); ++mj) {
+					__m128i c1 = _mm_loadu_si128((__m128i*) in2);
+					sum1br = _mm_add_epi16(sum1br, _mm_and_si128(c1, v_mask));
+					sum1ga = _mm_add_epi16(sum1ga, _mm_and_si128(_mm_srli_si128(c1, 1), v_mask));
+					in2 = (const uint32_t*) ((const uint8_t*) in2 + in_stride);
+				}
+				in += 4;
+				__m128i qbr = _mm_srli_epi16(_mm_add_epi16(sum1br, v_offset), my);
+				__m128i qga = _mm_srli_epi16(_mm_add_epi16(sum1ga, v_offset), my);
+				_mm_stream_si128((__m128i*) out, _mm_or_si128(qbr, _mm_slli_si128(qga, 1)));
+				out += 4;
+			}
+			blockrem = (in_w >> mx) & 3;
+		} else if(mx == 1) {
+			for(unsigned int out_i = 0; out_i < (in_w >> (mx + 2)); ++out_i) {
+				__m128i sum1br = _mm_setzero_si128(), sum1ga = _mm_setzero_si128(), sum2br = _mm_setzero_si128(), sum2ga = _mm_setzero_si128();
+				const uint32_t *in2 = in;
+				for(unsigned int mj = 0; mj < (1u << my); ++mj) {
+					__m128i c1 = _mm_loadu_si128((__m128i*) in2);
+					__m128i c2 = _mm_loadu_si128((__m128i*) (in2 + 4));
+					sum1br = _mm_add_epi16(sum1br, _mm_and_si128(c1, v_mask));
+					sum1ga = _mm_add_epi16(sum1ga, _mm_and_si128(_mm_srli_si128(c1, 1), v_mask));
+					sum2br = _mm_add_epi16(sum2br,
_mm_and_si128(c2, v_mask)); + sum2ga = _mm_add_epi16(sum2ga, _mm_and_si128(_mm_srli_si128(c2, 1), v_mask)); + in2 = (const uint32_t*) ((const uint8_t*) in2 + in_stride); + } + in += 8; + __m128i qbr = _mm_srli_epi16(_mm_add_epi16(_mm_hadd_epi32(sum1br, sum2br), v_offset), 1 + my); + __m128i qga = _mm_srli_epi16(_mm_add_epi16(_mm_hadd_epi32(sum1ga, sum2ga), v_offset), 1 + my); + _mm_stream_si128((__m128i*) out, _mm_or_si128(qbr, _mm_slli_si128(qga, 1))); + out += 4; + } + blockrem = (in_w >> mx) & 3; + } else { + for(unsigned int out_i = 0; out_i < (in_w >> (mx + 1)); ++out_i) { + __m128i sum1br = _mm_setzero_si128(), sum1ga = _mm_setzero_si128(), sum2br = _mm_setzero_si128(), sum2ga = _mm_setzero_si128(); + const uint32_t *in2 = in; + for(unsigned int mj = 0; mj < (1u << my); ++mj) { + for(unsigned int mi = 0; mi < (1u << (mx - 2)); ++mi) { + __m128i c1 = _mm_loadu_si128((__m128i*) (in2 + mi * 4)); + sum1br = _mm_add_epi16(sum1br, _mm_and_si128(c1, v_mask)); + sum1ga = _mm_add_epi16(sum1ga, _mm_and_si128(_mm_srli_si128(c1, 1), v_mask)); + } + for(unsigned int mi = (1u << (mx - 2)); mi < (1u << (mx - 1)); ++mi) { + __m128i c2 = _mm_loadu_si128((__m128i*) (in2 + mi * 4)); + sum2br = _mm_add_epi16(sum2br, _mm_and_si128(c2, v_mask)); + sum2ga = _mm_add_epi16(sum2ga, _mm_and_si128(_mm_srli_si128(c2, 1), v_mask)); + } + in2 = (const uint32_t*) ((const uint8_t*) in2 + in_stride); + } + in += (1u << (mx + 1)); + __m128i q = _mm_srli_epi16(_mm_add_epi16(_mm_hadd_epi32(_mm_hadd_epi32(sum1br, sum2br), _mm_hadd_epi32(sum1ga, sum2ga)), v_offset), mx + my); +#if defined(__x86_64__) && TEST_GCC_VERSION(4, 8) + _mm_stream_si64((long long*) out, _mm_cvtsi128_si64(_mm_or_si128(q, _mm_srli_si128(q, 7)))); +#else + _mm_storel_epi64((__m128i*) out, _mm_or_si128(q, _mm_srli_si128(q, 7))); +#endif + out += 2; + } + blockrem = (in_w >> mx) & 1; + } + for(unsigned int out_i = 0; out_i < blockrem; ++out_i) { + uint64_t sum = 0; + const uint32_t *in2 = in; + for(unsigned int mj = 0; mj < (1u << my); ++mj) { + for(unsigned int mi = 0; mi < (1u << mx); ++mi) { + uint64_t c = in2[mi]; + sum += ((c << 24) | c) & mask; + } + in2 = (const uint32_t*) ((const uint8_t*) in2 + in_stride); + } + in += (1u << mx); + uint64_t q = ((sum + offset) >> (mx + my)) & mask; + *(out++) = ((uint32_t) (q >> 24)) | ((uint32_t) q); + } + if(wrem != 0) { + uint64_t sum = 0; + const uint32_t *in2 = in; + for(unsigned int mj = 0; mj < (1u << my); ++mj) { + for(unsigned int mi = 0; mi < wrem - 1; ++mi) { + uint64_t c = in2[mi]; + sum += ((c << 24) | c) & mask; + } + uint64_t c = in2[wrem - 1]; + sum += (((c << 24) | c) & mask) * ((1u << mx) - (wrem - 1)); + in2 = (const uint32_t*) ((const uint8_t*) in2 + in_stride); + } + uint64_t q = ((sum + offset) >> (mx + my)) & mask; + *out = ((uint32_t) (q >> 24)) | ((uint32_t) q); + } + } + if(hrem != 0) { + unsigned int out_j = in_h >> my; + const uint32_t *in = (const uint32_t*) (in_data + in_stride * (int) (out_j << my)); + uint32_t *out = (uint32_t*) (out_data + out_stride * (int) out_j); + for(unsigned int out_i = 0; out_i < (in_w >> mx); ++out_i) { + uint64_t sum = 0; + const uint32_t *in2 = in; + for(unsigned int mj = 0; mj < hrem - 1; ++mj) { + for(unsigned int mi = 0; mi < (1u << mx); ++mi) { + uint64_t c = in2[mi]; + sum += ((c << 24) | c) & mask; + } + in2 = (const uint32_t*) ((const uint8_t*) in2 + in_stride); + } + for(unsigned int mi = 0; mi < (1u << mx); ++mi) { + uint64_t c = in2[mi]; + sum += (((c << 24) | c) & mask) * ((1u << my) - (hrem - 1)); + } + in += (1u << mx); + uint64_t q = 
((sum + offset) >> (mx + my)) & mask;
+			*(out++) = ((uint32_t) (q >> 24)) | ((uint32_t) q);
+		}
+		if(wrem != 0) {
+			uint64_t sum = 0;
+			const uint32_t *in2 = in;
+			for(unsigned int mj = 0; mj < hrem - 1; ++mj) {
+				for(unsigned int mi = 0; mi < wrem - 1; ++mi) {
+					uint64_t c = in2[mi];
+					sum += ((c << 24) | c) & mask;
+				}
+				uint64_t c = in2[wrem - 1];
+				sum += (((c << 24) | c) & mask) * ((1u << mx) - (wrem - 1));
+				in2 = (const uint32_t*) ((const uint8_t*) in2 + in_stride);
+			}
+			for(unsigned int mi = 0; mi < wrem - 1; ++mi) {
+				uint64_t c = in2[mi];
+				sum += (((c << 24) | c) & mask) * ((1u << my) - (hrem - 1));
+			}
+			uint64_t c = in2[wrem - 1];
+			sum += (((c << 24) | c) & mask) * ((1u << my) - (hrem - 1)) * ((1u << mx) - (wrem - 1));
+			uint64_t q = ((sum + offset) >> (mx + my)) & mask;
+			*out = ((uint32_t) (q >> 24)) | ((uint32_t) q);
+		}
+	}
+	_mm_sfence();
+}
+
+//void MipMap_BGRA_SSSE3(...)
+void MipMap_BGRA_LSX(unsigned int in_w, unsigned int in_h, const uint8_t* in_data, int in_stride,
+                     uint8_t* out_data, int out_stride, unsigned int mx, unsigned int my) {
+	assert(mx + my <= 8);
+	switch((mx << 4) | my) {
+		case 0x00: assert(false); break;
+		case 0x01: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 0, 1); break;
+		case 0x02: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 0, 2); break;
+		case 0x03: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 0, 3); break;
+		case 0x10: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 1, 0); break;
+		case 0x11: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 1, 1); break;
+		case 0x12: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 1, 2); break;
+		case 0x13: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 1, 3); break;
+		case 0x20: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 2, 0); break;
+		case 0x21: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 2, 1); break;
+		case 0x22: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 2, 2); break;
+		case 0x23: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 2, 3); break;
+		case 0x30: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 3, 0); break;
+		case 0x31: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 3, 1); break;
+		case 0x32: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 3, 2); break;
+		case 0x33: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 3, 3); break;
+		default: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, mx, my); break;
+	}
+}
+
+/*
+==== SSSE3 Bilinear Scaler ====
+
+Same principle as the fallback scaler, but this version produces two pixels per iteration. That means it can read 64-bit blocks and write 64-bit blocks,
+and the shuffles are also more efficient than just shifting.
+*/
+
+//void Bilinear_BGRA_SSSE3(...)
+void Bilinear_BGRA_LSX(unsigned int in_w, unsigned int in_h, const uint8_t* in_data, int in_stride,
+                       unsigned int out_w, unsigned int out_h, uint8_t* out_data, int out_stride,
+                       unsigned int mx, unsigned int my) {
+	assert(in_w > 1 && in_h > 1); //TODO// support size 1?
+	assert(out_w > 1 && out_h > 1); //TODO// support size 1?
+	assert(in_w < (1 << 28) && in_h < (1 << 28));
+	assert(out_w < (1 << 28) && out_h < (1 << 28));
+	assert((uintptr_t) out_data % 16 == 0 && out_stride % 16 == 0);
+
+	// precompute horizontal offsets and fractions
+	TempBuffer<unsigned int> x_offset_table;
+	TempBuffer<uint64_t> x_fraction_table;
+	x_offset_table.Alloc(out_w);
+	x_fraction_table.Alloc(out_w);
+	for(unsigned int out_i = 0; out_i < out_w; ++out_i) {
+		unsigned int x_fraction;
+		Bilinear_MapIndex(out_i, in_w, out_w, mx, x_offset_table[out_i], x_fraction);
+		x_fraction_table[out_i] = ((uint64_t) x_fraction << 48) | ((uint64_t) x_fraction << 32) | ((uint64_t) x_fraction << 16) | ((uint64_t) x_fraction);
+	}
+
+	// constants
+	__m128i v_128 = _mm_set1_epi16(128);
+	__m128i v_256 = _mm_set1_epi16(256);
+	__m128i v_shuffle1 = _mm_setr_epi8( 0, -1, 1, -1, 2, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+	__m128i v_shuffle2 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0, -1, 1, -1, 2, -1, 3, -1);
+	__m128i v_shuffle3 = _mm_setr_epi8( 4, -1, 5, -1, 6, -1 , 7, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+	__m128i v_shuffle4 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 4, -1, 5, -1, 6, -1, 7, -1);
+	__m128i v_shuffle5 = _mm_setr_epi8( 1, 3, 5, 7, 9, 11, 13, 15, -1, -1, -1, -1, -1, -1, -1, -1);
+
+	// scale
+	for(unsigned int out_j = 0; out_j < out_h; ++out_j) {
+		unsigned int y_offset, y_fraction;
+		Bilinear_MapIndex(out_j, in_h, out_h, my, y_offset, y_fraction);
+		__m128i vy_fraction = _mm_set1_epi16(y_fraction);
+		__m128i vy_fraction_inv = _mm_sub_epi16(v_256, vy_fraction);
+		unsigned int *x_offset_ptr = x_offset_table.GetData();
+		uint64_t *x_fraction_ptr = x_fraction_table.GetData();
+		const uint32_t *in1 = (const uint32_t*) (in_data + in_stride * (int) y_offset);
+		const uint32_t *in2 = (const uint32_t*) (in_data + in_stride * ((int) y_offset + 1));
+		uint32_t *out = (uint32_t*) (out_data + out_stride * (int) out_j);
+		for(unsigned int out_i = 0; out_i < out_w / 2; ++out_i) {
+
+			unsigned int x_offset1 = x_offset_ptr[0];
+			unsigned int x_offset2 = x_offset_ptr[1];
+			__m128i vx_fraction = _mm_load_si128((__m128i*) x_fraction_ptr);
+			__m128i vx_fraction_inv = _mm_sub_epi16(v_256, vx_fraction);
+			x_offset_ptr += 2;
+			x_fraction_ptr += 2;
+
+			__m128i c1a = _mm_loadl_epi64((__m128i*) (in1 + x_offset1));
+			__m128i c2a = _mm_loadl_epi64((__m128i*) (in1 + x_offset2));
+			__m128i c1b = _mm_loadl_epi64((__m128i*) (in2 + x_offset1));
+			__m128i c2b = _mm_loadl_epi64((__m128i*) (in2 + x_offset2));
+
+			//_mm_prefetch(in1 + x_offset2 + 64, _MM_HINT_T0);
+			//_mm_prefetch(in2 + x_offset2 + 64, _MM_HINT_T0);
+
+			__m128i p1 = _mm_or_si128(_mm_shuffle_epi8(c1a, v_shuffle1), _mm_shuffle_epi8(c2a, v_shuffle2));
+			__m128i p2 = _mm_or_si128(_mm_shuffle_epi8(c1a, v_shuffle3), _mm_shuffle_epi8(c2a, v_shuffle4));
+			__m128i q1 = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(p1, vx_fraction_inv), v_128), _mm_mullo_epi16(p2, vx_fraction)), 8);
+
+			__m128i p3 = _mm_or_si128(_mm_shuffle_epi8(c1b, v_shuffle1), _mm_shuffle_epi8(c2b, v_shuffle2));
+			__m128i p4 = _mm_or_si128(_mm_shuffle_epi8(c1b, v_shuffle3), _mm_shuffle_epi8(c2b, v_shuffle4));
+			__m128i q2 = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(p3, vx_fraction_inv), v_128), _mm_mullo_epi16(p4, vx_fraction)), 8);
+
+			__m128i r = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(q1, vy_fraction_inv), v_128), _mm_mullo_epi16(q2, vy_fraction));
+
+			_mm_storel_epi64((__m128i*) out, _mm_shuffle_epi8(r, v_shuffle5));
+			out += 2;
+
+		}
+		if(out_w & 1) {
+
+			unsigned int x_offset1 = x_offset_ptr[0];
+			__m128i
vx_fraction = _mm_loadl_epi64((__m128i*) x_fraction_ptr); + __m128i vx_fraction_inv = _mm_sub_epi16(v_256, vx_fraction); + + __m128i c1a = _mm_loadl_epi64((__m128i*) (in1 + x_offset1)); + __m128i c1b = _mm_loadl_epi64((__m128i*) (in2 + x_offset1)); + + __m128i p1 = _mm_shuffle_epi8(c1a, v_shuffle1); + __m128i p2 = _mm_shuffle_epi8(c1a, v_shuffle3); + __m128i q1 = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(p1, vx_fraction_inv), v_128), _mm_mullo_epi16(p2, vx_fraction)), 8); + + __m128i p3 = _mm_shuffle_epi8(c1b, v_shuffle1); + __m128i p4 = _mm_shuffle_epi8(c1b, v_shuffle3); + __m128i q2 = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(p3, vx_fraction_inv), v_128), _mm_mullo_epi16(p4, vx_fraction)), 8); + + __m128i r = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(q1, vy_fraction_inv), v_128), _mm_mullo_epi16(q2, vy_fraction)); + + *out = _mm_cvtsi128_si32(_mm_shuffle_epi8(r, v_shuffle5)); + + } + } + +} + +//void Scale_BGRA_SSSE3(...) +void Scale_BGRA_LSX(unsigned int in_w, unsigned int in_h, const uint8_t* in_data, int in_stride, + unsigned int out_w, unsigned int out_h, uint8_t* out_data, int out_stride) { + Scale_BGRA_Generic(in_w, in_h, in_data, in_stride, out_w, out_h, out_data, out_stride, MipMap_BGRA_LSX, Bilinear_BGRA_LSX); +} + +#endif diff --git a/src/Benchmark.cpp b/src/Benchmark.cpp index ae1ad518..af5426f1 100644 --- a/src/Benchmark.cpp +++ b/src/Benchmark.cpp @@ -114,7 +114,7 @@ void BenchmarkScale(unsigned int in_w, unsigned int in_h, unsigned int out_w, un } // run test - unsigned int time_swscale = 0, time_fallback = 0, time_ssse3 = 0; + unsigned int time_swscale = 0, time_fallback = 0, time_ssse3 = 0, time_lsx = 0; { SwsContext *sws = sws_getCachedContext(NULL, in_w, in_h, AV_PIX_FMT_BGRA, @@ -158,15 +158,28 @@ void BenchmarkScale(unsigned int in_w, unsigned int in_h, unsigned int out_w, un time_ssse3 = (t2 - t1) / run_size; } #endif +#if SSR_USE_LOONGARCH_ASM + if(CPUFeatures::HasLSX()) { + int64_t t1 = hrt_time_micro(); + for(unsigned int i = 0; i < run_size; ++i) { + unsigned int ii = i % queue_size; + Scale_BGRA_LSX(in_w, in_h, queue_in[ii]->m_data[0], queue_in[ii]->m_stride[0], + out_w, out_h, queue_out[ii]->m_data[0], queue_out[ii]->m_stride[0]); + } + int64_t t2 = hrt_time_micro(); + time_lsx = (t2 - t1) / run_size; + } +#endif // print result QString in_size = QString("%1x%2").arg(in_w).arg(in_h); QString out_size = QString("%1x%2").arg(out_w).arg(out_h); - Logger::LogInfo("[BenchmarkScale] " + Logger::tr("BGRA %1 to BGRA %2 | SWScale %3 us | Fallback %4 us (%5%) | SSSE3 %6 us (%7%)") + Logger::LogInfo("[BenchmarkScale] " + Logger::tr("BGRA %1 to BGRA %2 | SWScale %3 us | Fallback %4 us (%5%) | SSSE3 %6 us (%7%) | LSX %8 us (%9%)") .arg(in_size, 9).arg(out_size, 9) .arg(time_swscale, 6) .arg(time_fallback, 6).arg(100 * time_fallback / time_swscale, 3) - .arg(time_ssse3, 6).arg(100 * time_ssse3 / time_fallback, 3)); + .arg(time_ssse3, 6).arg(100 * time_ssse3 / time_fallback, 3) + .arg(time_lsx, 6).arg(100 * time_lsx / time_fallback, 3)); } @@ -174,6 +187,9 @@ void BenchmarkConvert(unsigned int w, unsigned int h, AVPixelFormat in_format, A #if SSR_USE_X86_ASM , ConvertFunc ssse3 #endif +#if SSR_USE_LOONGARCH_ASM +, ConvertFunc lsx +#endif ) { std::mt19937 rng(12345); @@ -195,7 +211,7 @@ void BenchmarkConvert(unsigned int w, unsigned int h, AVPixelFormat in_format, A } // run test - unsigned int time_swscale = 0, time_fallback = 0, time_ssse3 = 0; + unsigned int time_swscale = 0, time_fallback = 0, time_ssse3 = 0, time_lsx = 0; { SwsContext 
*sws = sws_getCachedContext(NULL, w, h, in_format, @@ -237,14 +253,26 @@ void BenchmarkConvert(unsigned int w, unsigned int h, AVPixelFormat in_format, A time_ssse3 = (t2 - t1) / run_size; } #endif +#if SSR_USE_LOONGARCH_ASM + if(CPUFeatures::HasLSX()) { + int64_t t1 = hrt_time_micro(); + for(unsigned int i = 0; i < run_size; ++i) { + unsigned int ii = i % queue_size; + lsx(w, h, queue_in[ii]->m_data[0], queue_in[ii]->m_stride[0], queue_out[ii]->m_data.data(), queue_out[ii]->m_stride.data()); + } + int64_t t2 = hrt_time_micro(); + time_lsx = (t2 - t1) / run_size; + } +#endif // print result QString size = QString("%1x%2").arg(w).arg(h); - Logger::LogInfo("[BenchmarkConvert] " + Logger::tr("%1 %2 to %3 %4 | SWScale %5 us | Fallback %6 us (%7%) | SSSE3 %8 us (%9%)") + Logger::LogInfo("[BenchmarkConvert] " + Logger::tr("%1 %2 to %3 %4 | SWScale %5 us | Fallback %6 us (%7%) | SSSE3 %8 us (%9%) | LSX %10 us (%11%)") .arg(in_format_name).arg(size, 9).arg(out_format_name).arg(size, 9) .arg(time_swscale, 6) .arg(time_fallback, 6).arg(100 * time_fallback / time_swscale, 3) - .arg(time_ssse3, 6).arg(100 * time_ssse3 / time_fallback, 3)); + .arg(time_ssse3, 6).arg(100 * time_ssse3 / time_fallback, 3) + .arg(time_lsx, 6).arg(100 * time_lsx / time_fallback, 3)); } @@ -264,6 +292,12 @@ void Benchmark() { BenchmarkConvert(1920, 1080, AV_PIX_FMT_BGRA, AV_PIX_FMT_YUV420P, "BGRA", "YUV420", NewImageBGRA, NewImageYUV420, Convert_BGRA_YUV420_Fallback , Convert_BGRA_YUV420_SSSE3 ); BenchmarkConvert(1920, 1080, AV_PIX_FMT_BGRA, AV_PIX_FMT_NV12 , "BGRA", "NV12 ", NewImageBGRA, NewImageNV12 , Convert_BGRA_NV12_Fallback , Convert_BGRA_NV12_SSSE3 ); BenchmarkConvert(1920, 1080, AV_PIX_FMT_BGRA, AV_PIX_FMT_BGR24 , "BGRA", "BGR ", NewImageBGRA, NewImageBGR , PlaneWrapper, PlaneWrapper); +#elif SSR_USE_LOONGARCH_ASM + BenchmarkConvert(1920, 1080, AV_PIX_FMT_BGRA, AV_PIX_FMT_YUV444P, "BGRA", "YUV444", NewImageBGRA, NewImageYUV444, Convert_BGRA_YUV444_Fallback , Convert_BGRA_YUV444_LSX ); + BenchmarkConvert(1920, 1080, AV_PIX_FMT_BGRA, AV_PIX_FMT_YUV422P, "BGRA", "YUV422", NewImageBGRA, NewImageYUV422, Convert_BGRA_YUV422_Fallback , Convert_BGRA_YUV422_LSX ); + BenchmarkConvert(1920, 1080, AV_PIX_FMT_BGRA, AV_PIX_FMT_YUV420P, "BGRA", "YUV420", NewImageBGRA, NewImageYUV420, Convert_BGRA_YUV420_Fallback , Convert_BGRA_YUV420_LSX ); + BenchmarkConvert(1920, 1080, AV_PIX_FMT_BGRA, AV_PIX_FMT_NV12 , "BGRA", "NV12 ", NewImageBGRA, NewImageNV12 , Convert_BGRA_NV12_Fallback , Convert_BGRA_NV12_LSX ); + BenchmarkConvert(1920, 1080, AV_PIX_FMT_BGRA, AV_PIX_FMT_BGR24 , "BGRA", "BGR ", NewImageBGRA, NewImageBGR , PlaneWrapper, PlaneWrapper); #else BenchmarkConvert(1920, 1080, AV_PIX_FMT_BGRA, AV_PIX_FMT_YUV444P, "BGRA", "YUV444", NewImageBGRA, NewImageYUV444, Convert_BGRA_YUV444_Fallback ); BenchmarkConvert(1920, 1080, AV_PIX_FMT_BGRA, AV_PIX_FMT_YUV422P, "BGRA", "YUV422", NewImageBGRA, NewImageYUV422, Convert_BGRA_YUV422_Fallback ); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e64d1748..e92a5165 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -32,6 +32,10 @@ else() find_package(Qt4 4.8 COMPONENTS QtGui REQUIRED) endif() +if(WITH_SIMDE) + find_package(SIMDE 0.8.3 REQUIRED) +endif() + set(sources AV/Input/ALSAInput.cpp AV/Input/ALSAInput.h @@ -164,6 +168,23 @@ if(ENABLE_X86_ASM) endif() +if(ENABLE_LOONGARCH_ASM) + + list(APPEND sources + AV/FastResampler_FirFilter_LSX.cpp + AV/FastScaler_Convert_LSX.cpp + AV/FastScaler_Scale_LSX.cpp + ) + + set_source_files_properties( + AV/FastResampler_FirFilter_LSX.cpp + 
AV/FastScaler_Convert_LSX.cpp
+		AV/FastScaler_Scale_LSX.cpp
+		PROPERTIES COMPILE_FLAGS -mlsx
+	)
+
+endif()
+
 set(res_input
 	../data/resources/resources.qrc
 )
@@ -235,6 +256,7 @@ target_link_libraries(simplescreenrecorder PRIVATE
 
 target_compile_definitions(simplescreenrecorder PRIVATE
 	-DSSR_USE_X86_ASM=$<BOOL:${ENABLE_X86_ASM}>
+	-DSSR_USE_LOONGARCH_ASM=$<AND:$<BOOL:${ENABLE_LOONGARCH_ASM}>,$<BOOL:${WITH_SIMDE}>>
 	-DSSR_USE_FFMPEG_VERSIONS=$<BOOL:${ENABLE_FFMPEG_VERSIONS}>
 	-DSSR_USE_JACK_METADATA=$<BOOL:${ENABLE_JACK_METADATA}>
 	-DSSR_USE_OPENGL_RECORDING=$<BOOL:${WITH_OPENGL_RECORDING}>
diff --git a/src/Main.cpp b/src/Main.cpp
index 4afeae20..436c7edf 100644
--- a/src/Main.cpp
+++ b/src/Main.cpp
@@ -99,7 +99,7 @@ int main(int argc, char* argv[]) {
 	Logger::LogInfo("==================== " + Logger::tr("SSR started") + " ====================");
 	Logger::LogInfo(GetVersionInfo());
 
-#if SSR_USE_X86_ASM
+#if SSR_USE_X86_ASM || SSR_USE_LOONGARCH_ASM
 	// detect CPU features
 	CPUFeatures::Detect();
 #endif
diff --git a/src/common/CPUFeatures.cpp b/src/common/CPUFeatures.cpp
index 04bfa1a8..2786aa0d 100644
--- a/src/common/CPUFeatures.cpp
+++ b/src/common/CPUFeatures.cpp
@@ -74,3 +74,28 @@ void CPUFeatures::Detect() {
 }
 
 #endif // SSR_USE_X86_ASM
+
+#if SSR_USE_LOONGARCH_ASM
+
+#include <sys/auxv.h>
+
+#define LA_HWCAP_LSX (1<<4)
+#define LA_HWCAP_LASX (1<<5)
+
+bool CPUFeatures::s_lsx = false;
+bool CPUFeatures::s_lasx = false;
+
+void CPUFeatures::Detect() {
+
+	QString str = "[CPUFeatures::Detect] " + Logger::tr("CPU features") + ":";
+
+	int flags = 0;
+	int flag = (int)getauxval(AT_HWCAP);
+
+	if (flag & LA_HWCAP_LSX) {s_lsx = true; str += " lsx";}
+	if (flag & LA_HWCAP_LASX) {s_lasx = true; str += " lasx";}
+
+	Logger::LogInfo(str);
+}
+
+#endif // SSR_USE_LOONGARCH_ASM
diff --git a/src/common/CPUFeatures.h b/src/common/CPUFeatures.h
index 728503cf..b05bcb1e 100644
--- a/src/common/CPUFeatures.h
+++ b/src/common/CPUFeatures.h
@@ -48,3 +48,20 @@ class CPUFeatures {
 };
 
 #endif // SSR_USE_X86_ASM
+
+#if SSR_USE_LOONGARCH_ASM
+
+class CPUFeatures {
+
+private:
+	static bool s_lsx, s_lasx;
+
+public:
+	static void Detect();
+
+	inline static bool HasLSX() { return s_lsx; }
+	inline static bool HasLASX() { return s_lasx; }
+
+};
+
+#endif // SSR_USE_LOONGARCH_ASM
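
Note (not part of the patch): the mechanism the new *_LSX.cpp files rely on is simde's native-alias mode. Defining SIMDE_ENABLE_NATIVE_ALIASES before including a simde header lets the existing _mm_* SSE/SSSE3 intrinsic names resolve to simde implementations, which are lowered to LSX instructions when the translation unit is compiled with -mlsx on LoongArch (and fall through to the native intrinsics on x86). A minimal standalone sketch of that pattern is shown below; the file name and build commands are illustrative only and are not taken from the patch.

// simde_lsx_example.cpp -- illustrative sketch, not part of SimpleScreenRecorder.
// The _mm_* names below are simde aliases: on LoongArch they are implemented
// with 128-bit LSX vectors, on x86 they map to the native SSE intrinsics.
#define SIMDE_ENABLE_NATIVE_ALIASES
#include <simde/x86/sse2.h>

#include <cstdio>

int main() {
	float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};
	float b[4] = {10.0f, 20.0f, 30.0f, 40.0f};
	float r[4];
	__m128 va = _mm_loadu_ps(a);    // unaligned 128-bit load of 4 floats
	__m128 vb = _mm_loadu_ps(b);
	__m128 vr = _mm_add_ps(va, vb); // four float additions in one vector op
	_mm_storeu_ps(r, vr);
	std::printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]);
	return 0;
}

// Possible build commands (illustrative):
//   LoongArch: g++ -O2 -mlsx simde_lsx_example.cpp -o simde_lsx_example
//   x86-64:    g++ -O2 simde_lsx_example.cpp -o simde_lsx_example

This is also why the patch compiles only the three *_LSX.cpp sources with COMPILE_FLAGS -mlsx in src/CMakeLists.txt instead of enabling LSX for the whole project.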