From cdd7a7269a871857019a2db289f25c8b85507f49 Mon Sep 17 00:00:00 2001
From: jinbo
Date: Thu, 5 Dec 2024 17:28:45 +0800
Subject: [PATCH 1/2] Fix typo in src/GUI/PageInput.h

---
 src/GUI/PageInput.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/GUI/PageInput.h b/src/GUI/PageInput.h
index e4789549..107bbc59 100644
--- a/src/GUI/PageInput.h
+++ b/src/GUI/PageInput.h
@@ -301,7 +301,7 @@ private slots:
 	inline unsigned int GetVideoV4L2Width() { return m_spinbox_video_v4l2_width->value(); }
 	inline unsigned int GetVideoV4L2Height() { return m_spinbox_video_v4l2_height->value(); }
 #endif
-#if SSR_USE_V4L2
+#if SSR_USE_PIPEWIRE
 	inline QString GetVideoPipeWireSource() { return m_lineedit_video_pipewire_source->text(); }
 	inline unsigned int GetVideoPipeWireWidth() { return m_spinbox_video_pipewire_width->value(); }
 	inline unsigned int GetVideoPipeWireHeight() { return m_spinbox_video_pipewire_height->value(); }

From 7d8a377c1e04c414b2b61364a858abe1a6d32ed9 Mon Sep 17 00:00:00 2001
From: jinbo
Date: Tue, 10 Dec 2024 15:13:34 +0800
Subject: [PATCH 2/2] Use simde to add LSX acceleration for loongarch

Add SIMD acceleration for the scaler, the converters and the FIR filter by
using simde to translate SSE intrinsics into LSX intrinsics (LSX is the
128-bit SIMD instruction set of the LoongArch platform).

simde introduction: https://github.com/simd-everywhere/simde
---
 CMakeLists.txt                         |   5 +
 cmake/FindSIMDE.cmake                  |  13 +
 src/AV/FastResampler.cpp               |  11 +-
 src/AV/FastResampler_FirFilter.h       |   6 +
 src/AV/FastResampler_FirFilter_LSX.cpp |  90 ++++++
 src/AV/FastScaler.cpp                  |  48 +++
 src/AV/FastScaler_Convert.h            |   8 +
 src/AV/FastScaler_Convert_LSX.cpp      | 428 +++++++++++++++++++++++++
 src/AV/FastScaler_Scale.h              |   4 +
 src/AV/FastScaler_Scale_LSX.cpp        | 342 ++++++++++++++++++++
 src/Benchmark.cpp                      |  46 ++-
 src/CMakeLists.txt                     |  22 ++
 src/Main.cpp                           |   2 +-
 src/common/CPUFeatures.cpp             |  25 ++
 src/common/CPUFeatures.h               |  17 +
 15 files changed, 1059 insertions(+), 8 deletions(-)
 create mode 100644 cmake/FindSIMDE.cmake
 create mode 100644 src/AV/FastResampler_FirFilter_LSX.cpp
 create mode 100644 src/AV/FastScaler_Convert_LSX.cpp
 create mode 100644 src/AV/FastScaler_Scale_LSX.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 94f8ccd7..f9059983 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,12 +11,16 @@ project(simplescreenrecorder VERSION 0.4.4)
 
 if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|i386|i686")
 	set(PROCESSOR_IS_X86 TRUE)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch")
+	set(PROCESSOR_IS_LOONGARCH TRUE)
 else()
 	set(PROCESSOR_IS_X86 FALSE)
+	set(PROCESSOR_IS_LOONGARCH FALSE)
 endif()
 
 option(ENABLE_32BIT_GLINJECT "Build the 32-bit version of 'libssr-glinject' on 64-bit systems (in addition to the 64-bit version). Required for OpenGL recording of 32-bit applications on 64-bit systems." FALSE)
 option(ENABLE_X86_ASM "Allow x86/x64 assembly or intrinsics." ${PROCESSOR_IS_X86})
+option(ENABLE_LOONGARCH_ASM "Allow loongarch assembly or intrinsics." ${PROCESSOR_IS_LOONGARCH})
 option(ENABLE_FFMPEG_VERSIONS "Use FFmpeg version numbers for feature support tests. Enable when using FFmpeg, disable when using Libav." TRUE)
 option(ENABLE_JACK_METADATA "Use the JACK metadata API. May not work with very old JACK versions." TRUE)
 option(WITH_OPENGL_RECORDING "Build with OpenGL recording support." TRUE)
@@ -28,6 +32,7 @@
 option(WITH_JACK "Build with JACK support." TRUE)
 option(WITH_QT5 "Build with Qt5 (instead of Qt4)." FALSE)
 option(WITH_SIMPLESCREENRECORDER "Build the 'simplescreenrecorder' executable."
TRUE) option(WITH_GLINJECT "Build the 'libssr-glinject' library. Required for OpenGL recording." TRUE) +option(WITH_SIMDE "Build with simde support." ${PROCESSOR_IS_LOONGARCH}) set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake) diff --git a/cmake/FindSIMDE.cmake b/cmake/FindSIMDE.cmake new file mode 100644 index 00000000..927a2dfd --- /dev/null +++ b/cmake/FindSIMDE.cmake @@ -0,0 +1,13 @@ +# rules for finding the SIMDE library + +find_package(PkgConfig REQUIRED) +pkg_check_modules(PC_SIMDE simde) + +find_path(SIMDE_INCLUDE_DIR simde/x86/sse2.h simde/x86/ssse3.h HINTS ${PC_SIMDE_INCLUDEDIR} ${PC_SIMDE_INCLUDE_DIRS}) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(SIMDE DEFAULT_MSG SIMDE_INCLUDE_DIR) + +mark_as_advanced(SIMDE_INCLUDE_DIR) + +set(SIMDE_INCLUDE_DIRS ${SIMDE_INCLUDE_DIR}) diff --git a/src/AV/FastResampler.cpp b/src/AV/FastResampler.cpp index 81a62362..08bb4d0c 100644 --- a/src/AV/FastResampler.cpp +++ b/src/AV/FastResampler.cpp @@ -105,13 +105,22 @@ FastResampler::FastResampler(unsigned int channels, float gain) { default: m_firfilter2_ptr = &FastResampler_FirFilter2_Cn_SSE2; break; } } else { +#endif +#if SSR_USE_LOONGARCH_ASM + if(CPUFeatures::HasLSX()) { + switch(m_channels) { + case 1: m_firfilter2_ptr = &FastResampler_FirFilter2_C1_LSX; break; + case 2: m_firfilter2_ptr = &FastResampler_FirFilter2_C2_LSX; break; + default: m_firfilter2_ptr = &FastResampler_FirFilter2_Cn_LSX; break; + } + } else { #endif switch(m_channels) { case 1: m_firfilter2_ptr = &FastResampler_FirFilter2_C1_Fallback; break; case 2: m_firfilter2_ptr = &FastResampler_FirFilter2_C2_Fallback; break; default: m_firfilter2_ptr = &FastResampler_FirFilter2_Cn_Fallback; break; } -#if SSR_USE_X86_ASM +#if SSR_USE_X86_ASM || SSR_USE_LOONGARCH_ASM } #endif diff --git a/src/AV/FastResampler_FirFilter.h b/src/AV/FastResampler_FirFilter.h index 3ab02ea5..90baf9ce 100644 --- a/src/AV/FastResampler_FirFilter.h +++ b/src/AV/FastResampler_FirFilter.h @@ -31,3 +31,9 @@ void FastResampler_FirFilter2_C1_SSE2(unsigned int channels, unsigned int filter void FastResampler_FirFilter2_C2_SSE2(unsigned int channels, unsigned int filter_length, float* coef1, float* coef2, float frac, float* input, float* output); void FastResampler_FirFilter2_Cn_SSE2(unsigned int channels, unsigned int filter_length, float* coef1, float* coef2, float frac, float* input, float* output); #endif + +#if SSR_USE_LOONGARCH_ASM +void FastResampler_FirFilter2_C1_LSX(unsigned int channels, unsigned int filter_length, float* coef1, float* coef2, float frac, float* input, float* output); +void FastResampler_FirFilter2_C2_LSX(unsigned int channels, unsigned int filter_length, float* coef1, float* coef2, float frac, float* input, float* output); +void FastResampler_FirFilter2_Cn_LSX(unsigned int channels, unsigned int filter_length, float* coef1, float* coef2, float frac, float* input, float* output); +#endif diff --git a/src/AV/FastResampler_FirFilter_LSX.cpp b/src/AV/FastResampler_FirFilter_LSX.cpp new file mode 100644 index 00000000..79e9f9c1 --- /dev/null +++ b/src/AV/FastResampler_FirFilter_LSX.cpp @@ -0,0 +1,90 @@ +/* +Copyright (c) 2012-2024 Maarten Baert + +This file is part of SimpleScreenRecorder. + +SimpleScreenRecorder is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. 
+
+SimpleScreenRecorder is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with SimpleScreenRecorder. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "FastResampler_FirFilter.h"
+
+#if SSR_USE_LOONGARCH_ASM
+
+#ifndef SIMDE_ENABLE_NATIVE_ALIASES
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#endif
+//using simde to translate SSE2 to LSX(loongarch 128-bit simd)
+#include <simde/x86/sse2.h>
+
+//void FastResampler_FirFilter2_C1_SSE2(...)
+void FastResampler_FirFilter2_C1_LSX(unsigned int channels, unsigned int filter_length, float* coef1, float* coef2, float frac, float* input, float* output) {
+	Q_UNUSED(channels);
+	__m128 sum = _mm_setzero_ps();
+	__m128 v_frac = _mm_set1_ps(frac);
+	for(unsigned int i = 0; i < filter_length / 4; ++i) {
+		__m128 v_coef1 = _mm_load_ps(coef1), v_coef2 = _mm_load_ps(coef2);
+		coef1 += 4; coef2 += 4;
+		__m128 filter_value = _mm_add_ps(v_coef1, _mm_mul_ps(_mm_sub_ps(v_coef2, v_coef1), v_frac));
+		__m128 v_input = _mm_loadu_ps(input);
+		input += 4;
+		sum = _mm_add_ps(sum, _mm_mul_ps(v_input, filter_value));
+	}
+	__m128 sum2 = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, 0x0e));
+	__m128 sum3 = _mm_add_ss(sum2, _mm_shuffle_ps(sum2, sum2, 0x01));
+	_mm_store_ss(output, sum3);
+}
+
+//void FastResampler_FirFilter2_C2_SSE2(...)
+void FastResampler_FirFilter2_C2_LSX(unsigned int channels, unsigned int filter_length, float* coef1, float* coef2, float frac, float* input, float* output) {
+	Q_UNUSED(channels);
+	__m128 sum = _mm_setzero_ps();
+	__m128 v_frac = _mm_set1_ps(frac);
+	for(unsigned int i = 0; i < filter_length / 4; ++i) {
+		__m128 v_coef1 = _mm_load_ps(coef1), v_coef2 = _mm_load_ps(coef2);
+		coef1 += 4; coef2 += 4;
+		__m128 filter_value = _mm_add_ps(v_coef1, _mm_mul_ps(_mm_sub_ps(v_coef2, v_coef1), v_frac));
+		__m128 v_input1 = _mm_loadu_ps(input), v_input2 = _mm_loadu_ps(input + 4);
+		input += 8;
+		sum = _mm_add_ps(sum, _mm_mul_ps(v_input1, _mm_unpacklo_ps(filter_value, filter_value)));
+		sum = _mm_add_ps(sum, _mm_mul_ps(v_input2, _mm_unpackhi_ps(filter_value, filter_value)));
+	}
+	__m128 sum2 = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, 0xee));
+	_mm_store_sd((double*) output, _mm_castps_pd(sum2));
+}
+
+//void FastResampler_FirFilter2_Cn_SSE2(...)
+void FastResampler_FirFilter2_Cn_LSX(unsigned int channels, unsigned int filter_length, float* coef1, float* coef2, float frac, float* input, float* output) { + Q_UNUSED(channels); + for(unsigned int c = 0; c < channels; ++c) { + __m128 sum = _mm_setzero_ps(); + __m128 v_frac = _mm_set1_ps(frac); + float *input2 = input + c; + for(unsigned int i = 0; i < filter_length / 4; ++i) { + __m128 v_coef1 = _mm_load_ps(coef1), v_coef2 = _mm_load_ps(coef2); + coef1 += 4; coef2 += 4; + __m128 filter_value = _mm_add_ps(v_coef1, _mm_mul_ps(_mm_sub_ps(v_coef2, v_coef1), v_frac)); + __m128 v_input1 = _mm_load_ss(input2); input2 += channels; + __m128 v_input2 = _mm_load_ss(input2); input2 += channels; + __m128 v_input3 = _mm_load_ss(input2); input2 += channels; + __m128 v_input4 = _mm_load_ss(input2); input2 += channels; + __m128 v_input = _mm_movelh_ps(_mm_unpacklo_ps(v_input1, v_input2), _mm_unpacklo_ps(v_input3, v_input4)); + sum = _mm_add_ps(sum, _mm_mul_ps(v_input, filter_value)); + } + __m128 sum2 = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, 0x0e)); + __m128 sum3 = _mm_add_ss(sum2, _mm_shuffle_ps(sum2, sum2, 0x01)); + _mm_store_ss(output + c, sum3); + } +} + +#endif diff --git a/src/AV/FastScaler.cpp b/src/AV/FastScaler.cpp index 3ef7ecb2..462b48c2 100644 --- a/src/AV/FastScaler.cpp +++ b/src/AV/FastScaler.cpp @@ -165,6 +165,14 @@ void FastScaler::Convert_BGRA_YUV444(unsigned int width, unsigned int height, co return; } #endif +#if SSR_USE_LOONGARCH_ASM + if(CPUFeatures::HasLSX()) { + Convert_BGRA_YUV444_LSX(width, height, in_data, in_stride, out_data, out_stride); + } else { + Convert_BGRA_YUV444_Fallback(width, height, in_data, in_stride, out_data, out_stride); + } + return; +#endif Convert_BGRA_YUV444_Fallback(width, height, in_data, in_stride, out_data, out_stride); @@ -190,6 +198,14 @@ void FastScaler::Convert_BGRA_YUV422(unsigned int width, unsigned int height, co return; } #endif +#if SSR_USE_LOONGARCH_ASM + if(CPUFeatures::HasLSX()) { + Convert_BGRA_YUV422_LSX(width, height, in_data, in_stride, out_data, out_stride); + } else { + Convert_BGRA_YUV422_Fallback(width, height, in_data, in_stride, out_data, out_stride); + } + return; +#endif Convert_BGRA_YUV422_Fallback(width, height, in_data, in_stride, out_data, out_stride); @@ -215,6 +231,14 @@ void FastScaler::Convert_BGRA_YUV420(unsigned int width, unsigned int height, co return; } #endif +#if SSR_USE_LOONGARCH_ASM + if(CPUFeatures::HasLSX()) { + Convert_BGRA_YUV420_LSX(width, height, in_data, in_stride, out_data, out_stride); + } else { + Convert_BGRA_YUV420_Fallback(width, height, in_data, in_stride, out_data, out_stride); + } + return; +#endif Convert_BGRA_YUV420_Fallback(width, height, in_data, in_stride, out_data, out_stride); @@ -239,6 +263,14 @@ void FastScaler::Convert_BGRA_NV12(unsigned int width, unsigned int height, cons return; } #endif +#if SSR_USE_LOONGARCH_ASM + if(CPUFeatures::HasLSX()) { + Convert_BGRA_NV12_LSX(width, height, in_data, in_stride, out_data, out_stride); + } else { + Convert_BGRA_NV12_Fallback(width, height, in_data, in_stride, out_data, out_stride); + } + return; +#endif Convert_BGRA_NV12_Fallback(width, height, in_data, in_stride, out_data, out_stride); @@ -261,6 +293,14 @@ void FastScaler::Convert_BGRA_BGR(unsigned int width, unsigned int height, const return; } #endif +#if SSR_USE_LOONGARCH_ASM + if(CPUFeatures::HasLSX()) { + Convert_BGRA_BGR_LSX(width, height, in_data, in_stride, out_data, out_stride); + } else { + Convert_BGRA_BGR_Fallback(width, height, in_data, in_stride, out_data, out_stride); + } + 
return;
+#endif
 
 	Convert_BGRA_BGR_Fallback(width, height, in_data, in_stride, out_data, out_stride);
 
@@ -284,6 +324,14 @@ void FastScaler::Scale_BGRA(unsigned int in_width, unsigned int in_height, const
 		return;
 	}
 #endif
+#if SSR_USE_LOONGARCH_ASM
+	if(CPUFeatures::HasLSX()) {
+		Scale_BGRA_LSX(in_width, in_height, in_data, in_stride, out_width, out_height, out_data, out_stride);
+	} else {
+		Scale_BGRA_Fallback(in_width, in_height, in_data, in_stride, out_width, out_height, out_data, out_stride);
+	}
+	return;
+#endif
 
 	Scale_BGRA_Fallback(in_width, in_height, in_data, in_stride, out_width, out_height, out_data, out_stride);
 
diff --git a/src/AV/FastScaler_Convert.h b/src/AV/FastScaler_Convert.h
index 91dec7fb..6d10c4d1 100644
--- a/src/AV/FastScaler_Convert.h
+++ b/src/AV/FastScaler_Convert.h
@@ -33,3 +33,11 @@ void Convert_BGRA_YUV420_SSSE3(unsigned int w, unsigned int h, const uint8_t* in
 void Convert_BGRA_NV12_SSSE3(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[2], const int out_stride[2]);
 void Convert_BGRA_BGR_SSSE3(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* out_data, int out_stride);
 #endif
+
+#if SSR_USE_LOONGARCH_ASM
+void Convert_BGRA_YUV444_LSX(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]);
+void Convert_BGRA_YUV422_LSX(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]);
+void Convert_BGRA_YUV420_LSX(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]);
+void Convert_BGRA_NV12_LSX(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[2], const int out_stride[2]);
+void Convert_BGRA_BGR_LSX(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* out_data, int out_stride);
+#endif
diff --git a/src/AV/FastScaler_Convert_LSX.cpp b/src/AV/FastScaler_Convert_LSX.cpp
new file mode 100644
index 00000000..cf9ffa57
--- /dev/null
+++ b/src/AV/FastScaler_Convert_LSX.cpp
@@ -0,0 +1,428 @@
+/*
+Copyright (c) 2012-2024 Maarten Baert
+
+This file is part of SimpleScreenRecorder.
+
+SimpleScreenRecorder is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+SimpleScreenRecorder is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with SimpleScreenRecorder. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "FastScaler_Convert.h"
+
+#if SSR_USE_LOONGARCH_ASM
+
+#ifndef SIMDE_ENABLE_NATIVE_ALIASES
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#endif
+//using simde to translate SSSE3 to LSX(loongarch 128-bit simd)
+#include <simde/x86/ssse3.h>
+/*
+==== SSSE3 BGRA-to-YUV444/YUV420 Converter ====
+
+Uses the same principle as the fallback converter, but uses 16-bit integers so it can do 8 operations at once.
+- YUV444: takes blocks of 16x1 pixels, produces 16x1 Y/U/V values +- YUV422: takes blocks of 16x1 pixels, produces 16x1 Y and 8x1 U/V values +- YUV420: takes blocks of 16x2 pixels, produces 16x2 Y and 8x1 U/V values + +The code uses interleaving to reduce the number of shuffles. So for example the order for red is [ r0 r4 r1 r5 r2 r6 r3 r7 ]. +For the averaging of 2x2 blocks, it uses 32-bit horizontal addition instead of 16-bit because of this interleaving. +The order of the final result is [ sr0 sr2 sr1 sr3 sr4 sr6 sr5 sr7 ]. + +If the width is not a multiple of 8/16, the remainder (right edge of the image) is converted without SSSE3. + +This converter is about 4 times faster than the fallback converter. +*/ + +#define ReadBGRAInterleaved(ptr1, ptr2, ca, cb, r, g, b) \ + __m128i ca = _mm_loadu_si128((__m128i*) (ptr1)), cb = _mm_loadu_si128((__m128i*) (ptr2)); \ + __m128i r = _mm_or_si128(_mm_and_si128(_mm_srli_si128(ca, 2), v_byte1), _mm_and_si128( cb , v_byte3)); \ + __m128i g = _mm_or_si128(_mm_and_si128(_mm_srli_si128(ca, 1), v_byte1), _mm_and_si128(_mm_slli_si128(cb, 1), v_byte3)); \ + __m128i b = _mm_or_si128(_mm_and_si128( ca , v_byte1), _mm_and_si128(_mm_slli_si128(cb, 2), v_byte3)); +#define Convert_RGB_Y(r, g, b, y) \ + __m128i y = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(r, v_mat_yr), _mm_mullo_epi16(g, v_mat_yg)), _mm_add_epi16(_mm_mullo_epi16(b, v_mat_yb), v_offset_y)); +#define Convert_RGB_U(r, g, b, u) \ + __m128i u = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(r, v_mat_ur), _mm_mullo_epi16(g, v_mat_ug)), _mm_add_epi16(_mm_mullo_epi16(b, v_mat_ub_vr), v_offset_uv)); +#define Convert_RGB_V(r, g, b, v) \ + __m128i v = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(r, v_mat_ub_vr), _mm_mullo_epi16(g, v_mat_vg)), _mm_add_epi16(_mm_mullo_epi16(b, v_mat_vb), v_offset_uv)); +#define WritePlaneInterleaved(ptr, y1, y2, sh1, sh2) \ + _mm_stream_si128((__m128i*) (ptr), _mm_or_si128(_mm_shuffle_epi8(y1, sh1), _mm_shuffle_epi8(y2, sh2))); + +//void Convert_BGRA_YUV444_SSSE3(...) 
+void Convert_BGRA_YUV444_LSX(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]) { + assert((uintptr_t) out_data[0] % 16 == 0 && out_stride[0] % 16 == 0); + assert((uintptr_t) out_data[1] % 16 == 0 && out_stride[1] % 16 == 0); + assert((uintptr_t) out_data[2] % 16 == 0 && out_stride[2] % 16 == 0); + + __m128i v_byte1 = _mm_set1_epi32(0x000000ff); + __m128i v_byte3 = _mm_set1_epi32(0x00ff0000); + __m128i v_mat_yr = _mm_set1_epi16(47); + __m128i v_mat_yg = _mm_set1_epi16(157); + __m128i v_mat_yb = _mm_set1_epi16(16); + __m128i v_mat_ur = _mm_set1_epi16(-26); + __m128i v_mat_ug = _mm_set1_epi16(-86); + __m128i v_mat_ub_vr = _mm_set1_epi16(112); + __m128i v_mat_vg = _mm_set1_epi16(-102); + __m128i v_mat_vb = _mm_set1_epi16(-10); + __m128i v_offset_y = _mm_set1_epi16((int16_t) (128 + (16 << 8))); + __m128i v_offset_uv = _mm_set1_epi16((int16_t) (128 + (128 << 8))); + __m128i v_shuffle1 = _mm_setr_epi8(1, 5, 9, 13, 3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1); + __m128i v_shuffle2 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 1, 5, 9, 13, 3, 7, 11, 15); + + const int offset_y = 128 + (16 << 8), offset_uv = 128 + (128 << 8); + + for(unsigned int j = 0; j < h; ++j) { + const uint32_t *rgb = (const uint32_t*) (in_data + in_stride * (int) j); + uint8_t *yuv_y = out_data[0] + out_stride[0] * (int) j; + uint8_t *yuv_u = out_data[1] + out_stride[1] * (int) j; + uint8_t *yuv_v = out_data[2] + out_stride[2] * (int) j; + for(unsigned int i = 0; i < w / 16; ++i) { + ReadBGRAInterleaved(rgb , rgb + 4, ca1, cb1, r1, g1, b1); + ReadBGRAInterleaved(rgb + 8, rgb + 12, ca2, cb2, r2, g2, b2); + _mm_prefetch(rgb + 48, _MM_HINT_T0); + rgb += 16; + Convert_RGB_Y(r1, g1, b1, y1); + Convert_RGB_Y(r2, g2, b2, y2); + WritePlaneInterleaved(yuv_y, y1, y2, v_shuffle1, v_shuffle2); + yuv_y += 16; + Convert_RGB_U(r1, g1, b1, u1); + Convert_RGB_U(r2, g2, b2, u2); + WritePlaneInterleaved(yuv_u, u1, u2, v_shuffle1, v_shuffle2); + yuv_u += 16; + Convert_RGB_V(r1, g1, b1, v1); + Convert_RGB_V(r2, g2, b2, v2); + WritePlaneInterleaved(yuv_v, v1, v2, v_shuffle1, v_shuffle2); + yuv_v += 16; + } + for(unsigned int i = 0; i < (w & 15); ++i) { + uint32_t c = *(rgb++); + int r = (int) ((c >> 16) & 0xff); + int g = (int) ((c >> 8) & 0xff); + int b = (int) ((c ) & 0xff); + *(yuv_y++) = ( 47 * r + 157 * g + 16 * b + offset_y) >> 8; + *(yuv_u++) = (-26 * r + -86 * g + 112 * b + offset_uv) >> 8; + *(yuv_v++) = (112 * r + -102 * g + -10 * b + offset_uv) >> 8; + } + } + + _mm_sfence(); + +} + +//void Convert_BGRA_YUV422_SSSE3(...) 
+void Convert_BGRA_YUV422_LSX(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]) { + assert(w % 2 == 0); + assert((uintptr_t) out_data[0] % 16 == 0 && out_stride[0] % 16 == 0); + assert((uintptr_t) out_data[1] % 16 == 0 && out_stride[1] % 16 == 0); + assert((uintptr_t) out_data[2] % 16 == 0 && out_stride[2] % 16 == 0); + + __m128i v_byte1 = _mm_set1_epi32(0x000000ff); + __m128i v_byte3 = _mm_set1_epi32(0x00ff0000); + __m128i v_mat_yr = _mm_set1_epi16(47); + __m128i v_mat_yg = _mm_set1_epi16(157); + __m128i v_mat_yb = _mm_set1_epi16(16); + __m128i v_mat_ur = _mm_set1_epi16(-26); + __m128i v_mat_ug = _mm_set1_epi16(-86); + __m128i v_mat_ub_vr = _mm_set1_epi16(112); + __m128i v_mat_vg = _mm_set1_epi16(-102); + __m128i v_mat_vb = _mm_set1_epi16(-10); + __m128i v_offset_y = _mm_set1_epi16((int16_t) (128 + (16 << 8))); + __m128i v_offset_uv = _mm_set1_epi16((int16_t) (128 + (128 << 8))); + __m128i v_shuffle1 = _mm_setr_epi8(1, 5, 9, 13, 3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1); + __m128i v_shuffle2 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 1, 5, 9, 13, 3, 7, 11, 15); + __m128i v_shuffle3 = _mm_setr_epi8(1, 5, 3, 7, 9, 13, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1); + + const int offset_y = 128 + (16 << 8), offset_uv = (128 + (128 << 8)) << 1; + + for(unsigned int j = 0; j < h; ++j) { + const uint32_t *rgb = (const uint32_t*) (in_data + in_stride * (int) j); + uint8_t *yuv_y = out_data[0] + out_stride[0] * (int) j; + uint8_t *yuv_u = out_data[1] + out_stride[1] * (int) j; + uint8_t *yuv_v = out_data[2] + out_stride[2] * (int) j; + for(unsigned int i = 0; i < w / 16; ++i) { + ReadBGRAInterleaved(rgb , rgb + 4, ca1, cb1, r1, g1, b1); + ReadBGRAInterleaved(rgb + 8, rgb + 12, ca2, cb2, r2, g2, b2); + _mm_prefetch(rgb + 48, _MM_HINT_T0); + rgb += 16; + Convert_RGB_Y(r1, g1, b1, y1); + Convert_RGB_Y(r2, g2, b2, y2); + WritePlaneInterleaved(yuv_y, y1, y2, v_shuffle1, v_shuffle2); + yuv_y += 16; + __m128i ra = _mm_srli_epi16(_mm_hadd_epi32(r1, r2), 1); + __m128i ga = _mm_srli_epi16(_mm_hadd_epi32(g1, g2), 1); + __m128i ba = _mm_srli_epi16(_mm_hadd_epi32(b1, b2), 1); + __m128i u = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(ra, v_mat_ur), _mm_mullo_epi16(ga, v_mat_ug)), _mm_add_epi16(_mm_mullo_epi16(ba, v_mat_ub_vr), v_offset_uv)); + _mm_storel_epi64((__m128i*) yuv_u, _mm_shuffle_epi8(u, v_shuffle3)); + yuv_u += 8; + __m128i v = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(ra, v_mat_ub_vr), _mm_mullo_epi16(ga, v_mat_vg)), _mm_add_epi16(_mm_mullo_epi16(ba, v_mat_vb), v_offset_uv)); + _mm_storel_epi64((__m128i*) yuv_v, _mm_shuffle_epi8(v, v_shuffle3)); + yuv_v += 8; + } + for(unsigned int i = 0; i < (w & 15) / 2; ++i) { + uint32_t c1 = rgb[0], c2 = rgb[1]; + rgb += 2; + int r1 = (int) ((c1 >> 16) & 0xff), r2 = (int) ((c2 >> 16) & 0xff); + int g1 = (int) ((c1 >> 8) & 0xff), g2 = (int) ((c2 >> 8) & 0xff); + int b1 = (int) ((c1 ) & 0xff), b2 = (int) ((c2 ) & 0xff); + yuv_y[0] = (47 * r1 + 157 * g1 + 16 * b1 + offset_y) >> 8; + yuv_y[1] = (47 * r2 + 157 * g2 + 16 * b2 + offset_y) >> 8; + yuv_y += 2; + int sr = r1 + r2; + int sg = g1 + g2; + int sb = b1 + b2; + *(yuv_u++) = (-26 * sr + -86 * sg + 112 * sb + offset_uv) >> 9; + *(yuv_v++) = (112 * sr + -102 * sg + -10 * sb + offset_uv) >> 9; + } + } + + _mm_sfence(); + +} + +//void Convert_BGRA_YUV420_SSSE3(...) 
+void Convert_BGRA_YUV420_LSX(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]) { + assert(w % 2 == 0 && h % 2 == 0); + assert((uintptr_t) out_data[0] % 16 == 0 && out_stride[0] % 16 == 0); + assert((uintptr_t) out_data[1] % 16 == 0 && out_stride[1] % 16 == 0); + assert((uintptr_t) out_data[2] % 16 == 0 && out_stride[2] % 16 == 0); + + __m128i v_byte1 = _mm_set1_epi32(0x000000ff); + __m128i v_byte3 = _mm_set1_epi32(0x00ff0000); + __m128i v_mat_yr = _mm_set1_epi16(47); + __m128i v_mat_yg = _mm_set1_epi16(157); + __m128i v_mat_yb = _mm_set1_epi16(16); + __m128i v_mat_ur = _mm_set1_epi16(-26); + __m128i v_mat_ug = _mm_set1_epi16(-86); + __m128i v_mat_ub_vr = _mm_set1_epi16(112); + __m128i v_mat_vg = _mm_set1_epi16(-102); + __m128i v_mat_vb = _mm_set1_epi16(-10); + __m128i v_offset_y = _mm_set1_epi16((int16_t) (128 + (16 << 8))); + __m128i v_offset_uv = _mm_set1_epi16((int16_t) (128 + (128 << 8))); + __m128i v_2 = _mm_set1_epi16(2); + __m128i v_shuffle1 = _mm_setr_epi8(1, 5, 9, 13, 3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1); + __m128i v_shuffle2 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 1, 5, 9, 13, 3, 7, 11, 15); + __m128i v_shuffle3 = _mm_setr_epi8(1, 5, 3, 7, 9, 13, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1); + + const int offset_y = 128 + (16 << 8), offset_uv = (128 + (128 << 8)) << 2; + + for(unsigned int j = 0; j < h / 2; ++j) { + const uint32_t *rgb1 = (const uint32_t*) (in_data + in_stride * (int) (j * 2)); + const uint32_t *rgb2 = (const uint32_t*) (in_data + in_stride * (int) (j * 2 + 1)); + uint8_t *yuv_y1 = out_data[0] + out_stride[0] * (int) (j * 2); + uint8_t *yuv_y2 = out_data[0] + out_stride[0] * (int) (j * 2 + 1); + uint8_t *yuv_u = out_data[1] + out_stride[1] * (int) j; + uint8_t *yuv_v = out_data[2] + out_stride[2] * (int) j; + for(unsigned int i = 0; i < w / 16; ++i) { + __m128i ra, ga, ba; + { + ReadBGRAInterleaved(rgb1 , rgb1 + 4, ca1, cb1, r1, g1, b1); + ReadBGRAInterleaved(rgb1 + 8, rgb1 + 12, ca2, cb2, r2, g2, b2); + rgb1 += 16; + Convert_RGB_Y(r1, g1, b1, y1); + Convert_RGB_Y(r2, g2, b2, y2); + WritePlaneInterleaved(yuv_y1, y1, y2, v_shuffle1, v_shuffle2); + yuv_y1 += 16; + _mm_prefetch(rgb1 + 16, _MM_HINT_T0); + ra = _mm_hadd_epi32(r1, r2); + ga = _mm_hadd_epi32(g1, g2); + ba = _mm_hadd_epi32(b1, b2); + } + { + ReadBGRAInterleaved(rgb2 , rgb2 + 4, ca1, cb1, r1, g1, b1); + ReadBGRAInterleaved(rgb2 + 8, rgb2 + 12, ca2, cb2, r2, g2, b2); + rgb2 += 16; + Convert_RGB_Y(r1, g1, b1, y1); + Convert_RGB_Y(r2, g2, b2, y2); + WritePlaneInterleaved(yuv_y2, y1, y2, v_shuffle1, v_shuffle2); + yuv_y2 += 16; + _mm_prefetch(rgb2 + 16, _MM_HINT_T0); + ra = _mm_add_epi16(ra, _mm_hadd_epi32(r1, r2)); + ga = _mm_add_epi16(ga, _mm_hadd_epi32(g1, g2)); + ba = _mm_add_epi16(ba, _mm_hadd_epi32(b1, b2)); + } + { + ra = _mm_srli_epi16(_mm_add_epi16(ra, v_2), 2); + ga = _mm_srli_epi16(_mm_add_epi16(ga, v_2), 2); + ba = _mm_srli_epi16(_mm_add_epi16(ba, v_2), 2); + __m128i u = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(ra, v_mat_ur), _mm_mullo_epi16(ga, v_mat_ug)), _mm_add_epi16(_mm_mullo_epi16(ba, v_mat_ub_vr), v_offset_uv)); + _mm_storel_epi64((__m128i*) yuv_u, _mm_shuffle_epi8(u, v_shuffle3)); + yuv_u += 8; + __m128i v = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(ra, v_mat_ub_vr), _mm_mullo_epi16(ga, v_mat_vg)), _mm_add_epi16(_mm_mullo_epi16(ba, v_mat_vb), v_offset_uv)); + _mm_storel_epi64((__m128i*) yuv_v, _mm_shuffle_epi8(v, v_shuffle3)); + yuv_v += 8; + } + } + for(unsigned int i = 0; i < (w & 15) / 2; ++i) { 
+ uint32_t c1 = rgb1[0], c2 = rgb1[1], c3 = rgb2[0], c4 = rgb2[1]; + rgb1 += 2; rgb2 += 2; + int r1 = (int) ((c1 >> 16) & 0xff), r2 = (int) ((c2 >> 16) & 0xff), r3 = (int) ((c3 >> 16) & 0xff), r4 = (int) ((c4 >> 16) & 0xff); + int g1 = (int) ((c1 >> 8) & 0xff), g2 = (int) ((c2 >> 8) & 0xff), g3 = (int) ((c3 >> 8) & 0xff), g4 = (int) ((c4 >> 8) & 0xff); + int b1 = (int) ((c1 ) & 0xff), b2 = (int) ((c2 ) & 0xff), b3 = (int) ((c3 ) & 0xff), b4 = (int) ((c4 ) & 0xff); + yuv_y1[0] = (47 * r1 + 157 * g1 + 16 * b1 + offset_y) >> 8; + yuv_y1[1] = (47 * r2 + 157 * g2 + 16 * b2 + offset_y) >> 8; + yuv_y2[0] = (47 * r3 + 157 * g3 + 16 * b3 + offset_y) >> 8; + yuv_y2[1] = (47 * r4 + 157 * g4 + 16 * b4 + offset_y) >> 8; + yuv_y1 += 2; yuv_y2 += 2; + int sr = r1 + r2 + r3 + r4; + int sg = g1 + g2 + g3 + g4; + int sb = b1 + b2 + b3 + b4; + *(yuv_u++) = (-26 * sr + -86 * sg + 112 * sb + offset_uv) >> 10; + *(yuv_v++) = (112 * sr + -102 * sg + -10 * sb + offset_uv) >> 10; + } + } + + _mm_sfence(); + +} + +//void Convert_BGRA_NV12_SSSE3(...) +void Convert_BGRA_NV12_LSX(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[2], const int out_stride[2]) { + assert(w % 2 == 0 && h % 2 == 0); + assert((uintptr_t) out_data[0] % 16 == 0 && out_stride[0] % 16 == 0); + assert((uintptr_t) out_data[1] % 16 == 0 && out_stride[1] % 16 == 0); + + __m128i v_byte1 = _mm_set1_epi32(0x000000ff); + __m128i v_byte3 = _mm_set1_epi32(0x00ff0000); + __m128i v_mat_yr = _mm_set1_epi16(47); + __m128i v_mat_yg = _mm_set1_epi16(157); + __m128i v_mat_yb = _mm_set1_epi16(16); + __m128i v_mat_ur = _mm_set1_epi16(-26); + __m128i v_mat_ug = _mm_set1_epi16(-86); + __m128i v_mat_ub_vr = _mm_set1_epi16(112); + __m128i v_mat_vg = _mm_set1_epi16(-102); + __m128i v_mat_vb = _mm_set1_epi16(-10); + __m128i v_offset_y = _mm_set1_epi16((int16_t) (128 + (16 << 8))); + __m128i v_offset_uv = _mm_set1_epi16((int16_t) (128 + (128 << 8))); + __m128i v_2 = _mm_set1_epi16(2); + __m128i v_shuffle1 = _mm_setr_epi8( 1, 5, 9, 13, 3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1); + __m128i v_shuffle2 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 1, 5, 9, 13, 3, 7, 11, 15); + __m128i v_shuffle3 = _mm_setr_epi8( 1, -1, 5, -1, 3, -1, 7, -1, 9, -1, 13, -1, 11, -1, 15, -1); + __m128i v_shuffle4 = _mm_setr_epi8(-1, 1, -1, 5, -1, 3, -1, 7, -1, 9, -1, 13, -1, 11, -1, 15); + + const int offset_y = 128 + (16 << 8), offset_uv = (128 + (128 << 8)) << 2; + + for(unsigned int j = 0; j < h / 2; ++j) { + const uint32_t *rgb1 = (const uint32_t*) (in_data + in_stride * (int) (j * 2)); + const uint32_t *rgb2 = (const uint32_t*) (in_data + in_stride * (int) (j * 2 + 1)); + uint8_t *yuv_y1 = out_data[0] + out_stride[0] * (int) (j * 2); + uint8_t *yuv_y2 = out_data[0] + out_stride[0] * (int) (j * 2 + 1); + uint8_t *yuv_uv = out_data[1] + out_stride[1] * (int) j; + for(unsigned int i = 0; i < w / 16; ++i) { + __m128i ra, ga, ba; + { + ReadBGRAInterleaved(rgb1 , rgb1 + 4, ca1, cb1, r1, g1, b1); + ReadBGRAInterleaved(rgb1 + 8, rgb1 + 12, ca2, cb2, r2, g2, b2); + rgb1 += 16; + Convert_RGB_Y(r1, g1, b1, y1); + Convert_RGB_Y(r2, g2, b2, y2); + WritePlaneInterleaved(yuv_y1, y1, y2, v_shuffle1, v_shuffle2); + yuv_y1 += 16; + _mm_prefetch(rgb1 + 16, _MM_HINT_T0); + ra = _mm_hadd_epi32(r1, r2); + ga = _mm_hadd_epi32(g1, g2); + ba = _mm_hadd_epi32(b1, b2); + } + { + ReadBGRAInterleaved(rgb2 , rgb2 + 4, ca1, cb1, r1, g1, b1); + ReadBGRAInterleaved(rgb2 + 8, rgb2 + 12, ca2, cb2, r2, g2, b2); + rgb2 += 16; + Convert_RGB_Y(r1, g1, b1, y1); + Convert_RGB_Y(r2, 
g2, b2, y2); + WritePlaneInterleaved(yuv_y2, y1, y2, v_shuffle1, v_shuffle2); + yuv_y2 += 16; + _mm_prefetch(rgb2 + 16, _MM_HINT_T0); + ra = _mm_add_epi16(ra, _mm_hadd_epi32(r1, r2)); + ga = _mm_add_epi16(ga, _mm_hadd_epi32(g1, g2)); + ba = _mm_add_epi16(ba, _mm_hadd_epi32(b1, b2)); + } + { + ra = _mm_srli_epi16(_mm_add_epi16(ra, v_2), 2); + ga = _mm_srli_epi16(_mm_add_epi16(ga, v_2), 2); + ba = _mm_srli_epi16(_mm_add_epi16(ba, v_2), 2); + __m128i u = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(ra, v_mat_ur), _mm_mullo_epi16(ga, v_mat_ug)), _mm_add_epi16(_mm_mullo_epi16(ba, v_mat_ub_vr), v_offset_uv)); + __m128i v = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(ra, v_mat_ub_vr), _mm_mullo_epi16(ga, v_mat_vg)), _mm_add_epi16(_mm_mullo_epi16(ba, v_mat_vb), v_offset_uv)); + WritePlaneInterleaved(yuv_uv, u, v, v_shuffle3, v_shuffle4); + yuv_uv += 16; + } + } + for(unsigned int i = 0; i < (w & 15) / 2; ++i) { + uint32_t c1 = rgb1[0], c2 = rgb1[1], c3 = rgb2[0], c4 = rgb2[1]; + rgb1 += 2; rgb2 += 2; + int r1 = (int) ((c1 >> 16) & 0xff), r2 = (int) ((c2 >> 16) & 0xff), r3 = (int) ((c3 >> 16) & 0xff), r4 = (int) ((c4 >> 16) & 0xff); + int g1 = (int) ((c1 >> 8) & 0xff), g2 = (int) ((c2 >> 8) & 0xff), g3 = (int) ((c3 >> 8) & 0xff), g4 = (int) ((c4 >> 8) & 0xff); + int b1 = (int) ((c1 ) & 0xff), b2 = (int) ((c2 ) & 0xff), b3 = (int) ((c3 ) & 0xff), b4 = (int) ((c4 ) & 0xff); + yuv_y1[0] = (47 * r1 + 157 * g1 + 16 * b1 + offset_y) >> 8; + yuv_y1[1] = (47 * r2 + 157 * g2 + 16 * b2 + offset_y) >> 8; + yuv_y2[0] = (47 * r3 + 157 * g3 + 16 * b3 + offset_y) >> 8; + yuv_y2[1] = (47 * r4 + 157 * g4 + 16 * b4 + offset_y) >> 8; + yuv_y1 += 2; yuv_y2 += 2; + int sr = r1 + r2 + r3 + r4; + int sg = g1 + g2 + g3 + g4; + int sb = b1 + b2 + b3 + b4; + yuv_uv[0] = (-26 * sr + -86 * sg + 112 * sb + offset_uv) >> 10; + yuv_uv[1] = (112 * sr + -102 * sg + -10 * sb + offset_uv) >> 10; + yuv_uv += 2; + } + } + + _mm_sfence(); + +} + +/* +==== SSSE3 BGRA-to-BGR Converter ==== + +Same as the fallback converter, but with a larger block size and shuffles instead of shifts and bitwise or. +- BGR: converts blocks of 16x1 pixels +*/ + +//void Convert_BGRA_BGR_SSSE3(...) 
+void Convert_BGRA_BGR_LSX(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* out_data, int out_stride) {
+	assert((uintptr_t) out_data % 16 == 0 && out_stride % 16 == 0);
+
+	__m128i v_shuffle1 = _mm_setr_epi8( 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1);
+	__m128i v_shuffle2 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 4);
+	__m128i v_shuffle3 = _mm_setr_epi8( 5, 6, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1, -1);
+	__m128i v_shuffle4 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 4, 5, 6, 8, 9);
+	__m128i v_shuffle5 = _mm_setr_epi8(10, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+	__m128i v_shuffle6 = _mm_setr_epi8(-1, -1, -1, -1, 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14);
+
+	for(unsigned int j = 0; j < h; ++j) {
+		const uint8_t *in = in_data + in_stride * (int) j;
+		uint8_t *out = out_data + out_stride * (int) j;
+		for(unsigned int i = 0; i < w / 16; ++i) {
+			__m128i c0 = _mm_loadu_si128((__m128i*) (in     ));
+			__m128i c1 = _mm_loadu_si128((__m128i*) (in + 16));
+			__m128i c2 = _mm_loadu_si128((__m128i*) (in + 32));
+			__m128i c3 = _mm_loadu_si128((__m128i*) (in + 48));
+			//_mm_prefetch(in + 192, _MM_HINT_T0);
+			in += 64;
+			_mm_stream_si128((__m128i*) (out     ), _mm_or_si128(_mm_shuffle_epi8(c0, v_shuffle1), _mm_shuffle_epi8(c1, v_shuffle2)));
+			_mm_stream_si128((__m128i*) (out + 16), _mm_or_si128(_mm_shuffle_epi8(c1, v_shuffle3), _mm_shuffle_epi8(c2, v_shuffle4)));
+			_mm_stream_si128((__m128i*) (out + 32), _mm_or_si128(_mm_shuffle_epi8(c2, v_shuffle5), _mm_shuffle_epi8(c3, v_shuffle6)));
+			out += 48;
+		}
+		for(unsigned int i = 0; i < (w & 15); ++i) {
+			uint32_t c = *((uint32_t*) in);
+			in += 4;
+			out[0] = c;
+			out[1] = c >> 8;
+			out[2] = c >> 16;
+			out += 3;
+		}
+	}
+
+	_mm_sfence();
+
+}
+
+#endif
diff --git a/src/AV/FastScaler_Scale.h b/src/AV/FastScaler_Scale.h
index c6cc3787..f6dac95e 100644
--- a/src/AV/FastScaler_Scale.h
+++ b/src/AV/FastScaler_Scale.h
@@ -27,3 +27,7 @@ void Scale_BGRA_Fallback(unsigned int in_w, unsigned int in_h, const uint8_t* in
 void Scale_BGRA_SSSE3(unsigned int in_w, unsigned int in_h, const uint8_t* in_data, int in_stride, unsigned int out_w, unsigned int out_h, uint8_t* out_data, int out_stride);
 #endif
+
+#if SSR_USE_LOONGARCH_ASM
+void Scale_BGRA_LSX(unsigned int in_w, unsigned int in_h, const uint8_t* in_data, int in_stride,
+                    unsigned int out_w, unsigned int out_h, uint8_t* out_data, int out_stride);
+#endif
diff --git a/src/AV/FastScaler_Scale_LSX.cpp b/src/AV/FastScaler_Scale_LSX.cpp
new file mode 100644
index 00000000..07a2a801
--- /dev/null
+++ b/src/AV/FastScaler_Scale_LSX.cpp
@@ -0,0 +1,342 @@
+/*
+Copyright (c) 2012-2024 Maarten Baert
+
+This file is part of SimpleScreenRecorder.
+
+SimpleScreenRecorder is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+SimpleScreenRecorder is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with SimpleScreenRecorder. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "FastScaler_Scale.h"
+
+#include "FastScaler_Scale_Generic.h"
+#include "TempBuffer.h"
+
+#if SSR_USE_LOONGARCH_ASM
+
+#ifndef SIMDE_ENABLE_NATIVE_ALIASES
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#endif
+//using simde to translate SSSE3 to LSX(loongarch 128-bit simd)
+#include <simde/x86/ssse3.h>
+
+/*
+==== SSSE3 MipMapper ====
+
+Very similar to the fallback mipmapper. There are three different SSSE3 kernels depending on the horizontal mipmap factor (mx).
+The principle is the same as with 'wannabe-SIMD', but here we want to use larger reads/writes so horizontal addition is used.
+This complicates the loops a lot and this is the reason why there are three different kernels: the first one has no horizontal addition,
+the second one has one horizontal addition, and the third one has three horizontal additions. The horizontal additions are slower and not associative,
+so they are avoided as much as possible by delaying them until the end.
+
+The remainders (edges of the image that require special attention) don't use SSSE3 because it's not worth it.
+
+You won't see huge improvements compared to the fallback mipmapper, since both algorithms are usually limited by the memory bandwidth.
+
+It's important that this function is force-inlined because this allows the compiler to eliminate the inner loops for common mipmap factors.
+*/
+
+//void MipMap_BGRA_SSSE3_Dynamic(...)
+inline __attribute__((always_inline))
+void MipMap_BGRA_LSX_Dynamic(unsigned int in_w, unsigned int in_h, const uint8_t* in_data, int in_stride,
+                             uint8_t* out_data, int out_stride, unsigned int mx, unsigned int my) {
+	assert((uintptr_t) out_data % 16 == 0 && out_stride % 16 == 0);
+	__m128i v_mask = _mm_set1_epi16(0xff);
+	__m128i v_offset = _mm_set1_epi16(1u << (mx + my - 1));
+	const uint64_t mask = vec4x16(0xff);
+	const uint64_t offset = vec4x16(1u << (mx + my - 1));
+	unsigned int wrem = in_w & ((1u << mx) - 1);
+	unsigned int hrem = in_h & ((1u << my) - 1);
+	for(unsigned int out_j = 0; out_j < (in_h >> my); ++out_j) {
+		const uint32_t *in = (const uint32_t*) (in_data + in_stride * (int) (out_j << my));
+		uint32_t *out = (uint32_t*) (out_data + out_stride * (int) out_j);
+		unsigned int blockrem;
+		if(mx == 0) {
+			for(unsigned int out_i = 0; out_i < (in_w >> (mx + 2)); ++out_i) {
+				__m128i sum1br = _mm_setzero_si128(), sum1ga = _mm_setzero_si128();
+				const uint32_t *in2 = in;
+				for(unsigned int mj = 0; mj < (1u << my); ++mj) {
+					__m128i c1 = _mm_loadu_si128((__m128i*) in2);
+					sum1br = _mm_add_epi16(sum1br, _mm_and_si128(c1, v_mask));
+					sum1ga = _mm_add_epi16(sum1ga, _mm_and_si128(_mm_srli_si128(c1, 1), v_mask));
+					in2 = (const uint32_t*) ((const uint8_t*) in2 + in_stride);
+				}
+				in += 4;
+				__m128i qbr = _mm_srli_epi16(_mm_add_epi16(sum1br, v_offset), my);
+				__m128i qga = _mm_srli_epi16(_mm_add_epi16(sum1ga, v_offset), my);
+				_mm_stream_si128((__m128i*) out, _mm_or_si128(qbr, _mm_slli_si128(qga, 1)));
+				out += 4;
+			}
+			blockrem = (in_w >> mx) & 3;
+		} else if(mx == 1) {
+			for(unsigned int out_i = 0; out_i < (in_w >> (mx + 2)); ++out_i) {
+				__m128i sum1br = _mm_setzero_si128(), sum1ga = _mm_setzero_si128(), sum2br = _mm_setzero_si128(), sum2ga = _mm_setzero_si128();
+				const uint32_t *in2 = in;
+				for(unsigned int mj = 0; mj < (1u << my); ++mj) {
+					__m128i c1 = _mm_loadu_si128((__m128i*) in2);
+					__m128i c2 = _mm_loadu_si128((__m128i*) (in2 + 4));
+					sum1br = _mm_add_epi16(sum1br, _mm_and_si128(c1, v_mask));
+					sum1ga = _mm_add_epi16(sum1ga, _mm_and_si128(_mm_srli_si128(c1, 1), v_mask));
+					sum2br = _mm_add_epi16(sum2br,
_mm_and_si128(c2, v_mask)); + sum2ga = _mm_add_epi16(sum2ga, _mm_and_si128(_mm_srli_si128(c2, 1), v_mask)); + in2 = (const uint32_t*) ((const uint8_t*) in2 + in_stride); + } + in += 8; + __m128i qbr = _mm_srli_epi16(_mm_add_epi16(_mm_hadd_epi32(sum1br, sum2br), v_offset), 1 + my); + __m128i qga = _mm_srli_epi16(_mm_add_epi16(_mm_hadd_epi32(sum1ga, sum2ga), v_offset), 1 + my); + _mm_stream_si128((__m128i*) out, _mm_or_si128(qbr, _mm_slli_si128(qga, 1))); + out += 4; + } + blockrem = (in_w >> mx) & 3; + } else { + for(unsigned int out_i = 0; out_i < (in_w >> (mx + 1)); ++out_i) { + __m128i sum1br = _mm_setzero_si128(), sum1ga = _mm_setzero_si128(), sum2br = _mm_setzero_si128(), sum2ga = _mm_setzero_si128(); + const uint32_t *in2 = in; + for(unsigned int mj = 0; mj < (1u << my); ++mj) { + for(unsigned int mi = 0; mi < (1u << (mx - 2)); ++mi) { + __m128i c1 = _mm_loadu_si128((__m128i*) (in2 + mi * 4)); + sum1br = _mm_add_epi16(sum1br, _mm_and_si128(c1, v_mask)); + sum1ga = _mm_add_epi16(sum1ga, _mm_and_si128(_mm_srli_si128(c1, 1), v_mask)); + } + for(unsigned int mi = (1u << (mx - 2)); mi < (1u << (mx - 1)); ++mi) { + __m128i c2 = _mm_loadu_si128((__m128i*) (in2 + mi * 4)); + sum2br = _mm_add_epi16(sum2br, _mm_and_si128(c2, v_mask)); + sum2ga = _mm_add_epi16(sum2ga, _mm_and_si128(_mm_srli_si128(c2, 1), v_mask)); + } + in2 = (const uint32_t*) ((const uint8_t*) in2 + in_stride); + } + in += (1u << (mx + 1)); + __m128i q = _mm_srli_epi16(_mm_add_epi16(_mm_hadd_epi32(_mm_hadd_epi32(sum1br, sum2br), _mm_hadd_epi32(sum1ga, sum2ga)), v_offset), mx + my); +#if defined(__x86_64__) && TEST_GCC_VERSION(4, 8) + _mm_stream_si64((long long*) out, _mm_cvtsi128_si64(_mm_or_si128(q, _mm_srli_si128(q, 7)))); +#else + _mm_storel_epi64((__m128i*) out, _mm_or_si128(q, _mm_srli_si128(q, 7))); +#endif + out += 2; + } + blockrem = (in_w >> mx) & 1; + } + for(unsigned int out_i = 0; out_i < blockrem; ++out_i) { + uint64_t sum = 0; + const uint32_t *in2 = in; + for(unsigned int mj = 0; mj < (1u << my); ++mj) { + for(unsigned int mi = 0; mi < (1u << mx); ++mi) { + uint64_t c = in2[mi]; + sum += ((c << 24) | c) & mask; + } + in2 = (const uint32_t*) ((const uint8_t*) in2 + in_stride); + } + in += (1u << mx); + uint64_t q = ((sum + offset) >> (mx + my)) & mask; + *(out++) = ((uint32_t) (q >> 24)) | ((uint32_t) q); + } + if(wrem != 0) { + uint64_t sum = 0; + const uint32_t *in2 = in; + for(unsigned int mj = 0; mj < (1u << my); ++mj) { + for(unsigned int mi = 0; mi < wrem - 1; ++mi) { + uint64_t c = in2[mi]; + sum += ((c << 24) | c) & mask; + } + uint64_t c = in2[wrem - 1]; + sum += (((c << 24) | c) & mask) * ((1u << mx) - (wrem - 1)); + in2 = (const uint32_t*) ((const uint8_t*) in2 + in_stride); + } + uint64_t q = ((sum + offset) >> (mx + my)) & mask; + *out = ((uint32_t) (q >> 24)) | ((uint32_t) q); + } + } + if(hrem != 0) { + unsigned int out_j = in_h >> my; + const uint32_t *in = (const uint32_t*) (in_data + in_stride * (int) (out_j << my)); + uint32_t *out = (uint32_t*) (out_data + out_stride * (int) out_j); + for(unsigned int out_i = 0; out_i < (in_w >> mx); ++out_i) { + uint64_t sum = 0; + const uint32_t *in2 = in; + for(unsigned int mj = 0; mj < hrem - 1; ++mj) { + for(unsigned int mi = 0; mi < (1u << mx); ++mi) { + uint64_t c = in2[mi]; + sum += ((c << 24) | c) & mask; + } + in2 = (const uint32_t*) ((const uint8_t*) in2 + in_stride); + } + for(unsigned int mi = 0; mi < (1u << mx); ++mi) { + uint64_t c = in2[mi]; + sum += (((c << 24) | c) & mask) * ((1u << my) - (hrem - 1)); + } + in += (1u << mx); + uint64_t q = 
((sum + offset) >> (mx + my)) & mask;
+			*(out++) = ((uint32_t) (q >> 24)) | ((uint32_t) q);
+		}
+		if(wrem != 0) {
+			uint64_t sum = 0;
+			const uint32_t *in2 = in;
+			for(unsigned int mj = 0; mj < hrem - 1; ++mj) {
+				for(unsigned int mi = 0; mi < wrem - 1; ++mi) {
+					uint64_t c = in2[mi];
+					sum += ((c << 24) | c) & mask;
+				}
+				uint64_t c = in2[wrem - 1];
+				sum += (((c << 24) | c) & mask) * ((1u << mx) - (wrem - 1));
+				in2 = (const uint32_t*) ((const uint8_t*) in2 + in_stride);
+			}
+			for(unsigned int mi = 0; mi < wrem - 1; ++mi) {
+				uint64_t c = in2[mi];
+				sum += (((c << 24) | c) & mask) * ((1u << my) - (hrem - 1));
+			}
+			uint64_t c = in2[wrem - 1];
+			sum += (((c << 24) | c) & mask) * ((1u << my) - (hrem - 1)) * ((1u << mx) - (wrem - 1));
+			uint64_t q = ((sum + offset) >> (mx + my)) & mask;
+			*out = ((uint32_t) (q >> 24)) | ((uint32_t) q);
+		}
+	}
+	_mm_sfence();
+}
+
+//void MipMap_BGRA_SSSE3(...)
+void MipMap_BGRA_LSX(unsigned int in_w, unsigned int in_h, const uint8_t* in_data, int in_stride,
+                     uint8_t* out_data, int out_stride, unsigned int mx, unsigned int my) {
+	assert(mx + my <= 8);
+	switch((mx << 4) | my) {
+		case 0x00: assert(false); break;
+		case 0x01: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 0, 1); break;
+		case 0x02: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 0, 2); break;
+		case 0x03: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 0, 3); break;
+		case 0x10: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 1, 0); break;
+		case 0x11: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 1, 1); break;
+		case 0x12: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 1, 2); break;
+		case 0x13: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 1, 3); break;
+		case 0x20: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 2, 0); break;
+		case 0x21: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 2, 1); break;
+		case 0x22: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 2, 2); break;
+		case 0x23: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 2, 3); break;
+		case 0x30: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 3, 0); break;
+		case 0x31: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 3, 1); break;
+		case 0x32: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 3, 2); break;
+		case 0x33: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 3, 3); break;
+		default: MipMap_BGRA_LSX_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, mx, my); break;
+	}
+}
+
+/*
+==== SSSE3 Bilinear Scaler ====
+
+Same principle as the fallback scaler, but this version produces two pixels per iteration. That means it can read 64-bit blocks and write 64-bit blocks,
+and the shuffles are also more efficient than just shifting.
+*/
+
+//void Bilinear_BGRA_SSSE3(...)
+void Bilinear_BGRA_LSX(unsigned int in_w, unsigned int in_h, const uint8_t* in_data, int in_stride,
+                       unsigned int out_w, unsigned int out_h, uint8_t* out_data, int out_stride,
+                       unsigned int mx, unsigned int my) {
+	assert(in_w > 1 && in_h > 1); //TODO// support size 1?
+	assert(out_w > 1 && out_h > 1); //TODO// support size 1?
+	assert(in_w < (1 << 28) && in_h < (1 << 28));
+	assert(out_w < (1 << 28) && out_h < (1 << 28));
+	assert((uintptr_t) out_data % 16 == 0 && out_stride % 16 == 0);
+
+	// precompute horizontal offsets and fractions
+	TempBuffer<unsigned int> x_offset_table;
+	TempBuffer<uint64_t> x_fraction_table;
+	x_offset_table.Alloc(out_w);
+	x_fraction_table.Alloc(out_w);
+	for(unsigned int out_i = 0; out_i < out_w; ++out_i) {
+		unsigned int x_fraction;
+		Bilinear_MapIndex(out_i, in_w, out_w, mx, x_offset_table[out_i], x_fraction);
+		x_fraction_table[out_i] = ((uint64_t) x_fraction << 48) | ((uint64_t) x_fraction << 32) | ((uint64_t) x_fraction << 16) | ((uint64_t) x_fraction);
+	}
+
+	// constants
+	__m128i v_128 = _mm_set1_epi16(128);
+	__m128i v_256 = _mm_set1_epi16(256);
+	__m128i v_shuffle1 = _mm_setr_epi8( 0, -1, 1, -1, 2, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+	__m128i v_shuffle2 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0, -1, 1, -1, 2, -1, 3, -1);
+	__m128i v_shuffle3 = _mm_setr_epi8( 4, -1, 5, -1, 6, -1 , 7, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+	__m128i v_shuffle4 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 4, -1, 5, -1, 6, -1, 7, -1);
+	__m128i v_shuffle5 = _mm_setr_epi8( 1, 3, 5, 7, 9, 11, 13, 15, -1, -1, -1, -1, -1, -1, -1, -1);
+
+	// scale
+	for(unsigned int out_j = 0; out_j < out_h; ++out_j) {
+		unsigned int y_offset, y_fraction;
+		Bilinear_MapIndex(out_j, in_h, out_h, my, y_offset, y_fraction);
+		__m128i vy_fraction = _mm_set1_epi16(y_fraction);
+		__m128i vy_fraction_inv = _mm_sub_epi16(v_256, vy_fraction);
+		unsigned int *x_offset_ptr = x_offset_table.GetData();
+		uint64_t *x_fraction_ptr = x_fraction_table.GetData();
+		const uint32_t *in1 = (const uint32_t*) (in_data + in_stride * (int) y_offset);
+		const uint32_t *in2 = (const uint32_t*) (in_data + in_stride * ((int) y_offset + 1));
+		uint32_t *out = (uint32_t*) (out_data + out_stride * (int) out_j);
+		for(unsigned int out_i = 0; out_i < out_w / 2; ++out_i) {
+
+			unsigned int x_offset1 = x_offset_ptr[0];
+			unsigned int x_offset2 = x_offset_ptr[1];
+			__m128i vx_fraction = _mm_load_si128((__m128i*) x_fraction_ptr);
+			__m128i vx_fraction_inv = _mm_sub_epi16(v_256, vx_fraction);
+			x_offset_ptr += 2;
+			x_fraction_ptr += 2;
+
+			__m128i c1a = _mm_loadl_epi64((__m128i*) (in1 + x_offset1));
+			__m128i c2a = _mm_loadl_epi64((__m128i*) (in1 + x_offset2));
+			__m128i c1b = _mm_loadl_epi64((__m128i*) (in2 + x_offset1));
+			__m128i c2b = _mm_loadl_epi64((__m128i*) (in2 + x_offset2));
+
+			//_mm_prefetch(in1 + x_offset2 + 64, _MM_HINT_T0);
+			//_mm_prefetch(in2 + x_offset2 + 64, _MM_HINT_T0);
+
+			__m128i p1 = _mm_or_si128(_mm_shuffle_epi8(c1a, v_shuffle1), _mm_shuffle_epi8(c2a, v_shuffle2));
+			__m128i p2 = _mm_or_si128(_mm_shuffle_epi8(c1a, v_shuffle3), _mm_shuffle_epi8(c2a, v_shuffle4));
+			__m128i q1 = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(p1, vx_fraction_inv), v_128), _mm_mullo_epi16(p2, vx_fraction)), 8);
+
+			__m128i p3 = _mm_or_si128(_mm_shuffle_epi8(c1b, v_shuffle1), _mm_shuffle_epi8(c2b, v_shuffle2));
+			__m128i p4 = _mm_or_si128(_mm_shuffle_epi8(c1b, v_shuffle3), _mm_shuffle_epi8(c2b, v_shuffle4));
+			__m128i q2 = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(p3, vx_fraction_inv), v_128), _mm_mullo_epi16(p4, vx_fraction)), 8);
+
+			__m128i r = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(q1, vy_fraction_inv), v_128), _mm_mullo_epi16(q2, vy_fraction));
+
+			_mm_storel_epi64((__m128i*) out, _mm_shuffle_epi8(r, v_shuffle5));
+			out += 2;
+
+		}
+		if(out_w & 1) {
+
+			unsigned int x_offset1 = x_offset_ptr[0];
+			__m128i
vx_fraction = _mm_loadl_epi64((__m128i*) x_fraction_ptr); + __m128i vx_fraction_inv = _mm_sub_epi16(v_256, vx_fraction); + + __m128i c1a = _mm_loadl_epi64((__m128i*) (in1 + x_offset1)); + __m128i c1b = _mm_loadl_epi64((__m128i*) (in2 + x_offset1)); + + __m128i p1 = _mm_shuffle_epi8(c1a, v_shuffle1); + __m128i p2 = _mm_shuffle_epi8(c1a, v_shuffle3); + __m128i q1 = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(p1, vx_fraction_inv), v_128), _mm_mullo_epi16(p2, vx_fraction)), 8); + + __m128i p3 = _mm_shuffle_epi8(c1b, v_shuffle1); + __m128i p4 = _mm_shuffle_epi8(c1b, v_shuffle3); + __m128i q2 = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(p3, vx_fraction_inv), v_128), _mm_mullo_epi16(p4, vx_fraction)), 8); + + __m128i r = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(q1, vy_fraction_inv), v_128), _mm_mullo_epi16(q2, vy_fraction)); + + *out = _mm_cvtsi128_si32(_mm_shuffle_epi8(r, v_shuffle5)); + + } + } + +} + +//void Scale_BGRA_SSSE3(...) +void Scale_BGRA_LSX(unsigned int in_w, unsigned int in_h, const uint8_t* in_data, int in_stride, + unsigned int out_w, unsigned int out_h, uint8_t* out_data, int out_stride) { + Scale_BGRA_Generic(in_w, in_h, in_data, in_stride, out_w, out_h, out_data, out_stride, MipMap_BGRA_LSX, Bilinear_BGRA_LSX); +} + +#endif diff --git a/src/Benchmark.cpp b/src/Benchmark.cpp index ae1ad518..af5426f1 100644 --- a/src/Benchmark.cpp +++ b/src/Benchmark.cpp @@ -114,7 +114,7 @@ void BenchmarkScale(unsigned int in_w, unsigned int in_h, unsigned int out_w, un } // run test - unsigned int time_swscale = 0, time_fallback = 0, time_ssse3 = 0; + unsigned int time_swscale = 0, time_fallback = 0, time_ssse3 = 0, time_lsx = 0; { SwsContext *sws = sws_getCachedContext(NULL, in_w, in_h, AV_PIX_FMT_BGRA, @@ -158,15 +158,28 @@ void BenchmarkScale(unsigned int in_w, unsigned int in_h, unsigned int out_w, un time_ssse3 = (t2 - t1) / run_size; } #endif +#if SSR_USE_LOONGARCH_ASM + if(CPUFeatures::HasLSX()) { + int64_t t1 = hrt_time_micro(); + for(unsigned int i = 0; i < run_size; ++i) { + unsigned int ii = i % queue_size; + Scale_BGRA_LSX(in_w, in_h, queue_in[ii]->m_data[0], queue_in[ii]->m_stride[0], + out_w, out_h, queue_out[ii]->m_data[0], queue_out[ii]->m_stride[0]); + } + int64_t t2 = hrt_time_micro(); + time_lsx = (t2 - t1) / run_size; + } +#endif // print result QString in_size = QString("%1x%2").arg(in_w).arg(in_h); QString out_size = QString("%1x%2").arg(out_w).arg(out_h); - Logger::LogInfo("[BenchmarkScale] " + Logger::tr("BGRA %1 to BGRA %2 | SWScale %3 us | Fallback %4 us (%5%) | SSSE3 %6 us (%7%)") + Logger::LogInfo("[BenchmarkScale] " + Logger::tr("BGRA %1 to BGRA %2 | SWScale %3 us | Fallback %4 us (%5%) | SSSE3 %6 us (%7%) | LSX %8 us (%9%)") .arg(in_size, 9).arg(out_size, 9) .arg(time_swscale, 6) .arg(time_fallback, 6).arg(100 * time_fallback / time_swscale, 3) - .arg(time_ssse3, 6).arg(100 * time_ssse3 / time_fallback, 3)); + .arg(time_ssse3, 6).arg(100 * time_ssse3 / time_fallback, 3) + .arg(time_lsx, 6).arg(100 * time_lsx / time_fallback, 3)); } @@ -174,6 +187,9 @@ void BenchmarkConvert(unsigned int w, unsigned int h, AVPixelFormat in_format, A #if SSR_USE_X86_ASM , ConvertFunc ssse3 #endif +#if SSR_USE_LOONGARCH_ASM +, ConvertFunc lsx +#endif ) { std::mt19937 rng(12345); @@ -195,7 +211,7 @@ void BenchmarkConvert(unsigned int w, unsigned int h, AVPixelFormat in_format, A } // run test - unsigned int time_swscale = 0, time_fallback = 0, time_ssse3 = 0; + unsigned int time_swscale = 0, time_fallback = 0, time_ssse3 = 0, time_lsx = 0; { SwsContext 
*sws = sws_getCachedContext(NULL, w, h, in_format, @@ -237,14 +253,26 @@ void BenchmarkConvert(unsigned int w, unsigned int h, AVPixelFormat in_format, A time_ssse3 = (t2 - t1) / run_size; } #endif +#if SSR_USE_LOONGARCH_ASM + if(CPUFeatures::HasLSX()) { + int64_t t1 = hrt_time_micro(); + for(unsigned int i = 0; i < run_size; ++i) { + unsigned int ii = i % queue_size; + lsx(w, h, queue_in[ii]->m_data[0], queue_in[ii]->m_stride[0], queue_out[ii]->m_data.data(), queue_out[ii]->m_stride.data()); + } + int64_t t2 = hrt_time_micro(); + time_lsx = (t2 - t1) / run_size; + } +#endif // print result QString size = QString("%1x%2").arg(w).arg(h); - Logger::LogInfo("[BenchmarkConvert] " + Logger::tr("%1 %2 to %3 %4 | SWScale %5 us | Fallback %6 us (%7%) | SSSE3 %8 us (%9%)") + Logger::LogInfo("[BenchmarkConvert] " + Logger::tr("%1 %2 to %3 %4 | SWScale %5 us | Fallback %6 us (%7%) | SSSE3 %8 us (%9%) | LSX %10 us (%11%)") .arg(in_format_name).arg(size, 9).arg(out_format_name).arg(size, 9) .arg(time_swscale, 6) .arg(time_fallback, 6).arg(100 * time_fallback / time_swscale, 3) - .arg(time_ssse3, 6).arg(100 * time_ssse3 / time_fallback, 3)); + .arg(time_ssse3, 6).arg(100 * time_ssse3 / time_fallback, 3) + .arg(time_lsx, 6).arg(100 * time_lsx / time_fallback, 3)); } @@ -264,6 +292,12 @@ void Benchmark() { BenchmarkConvert(1920, 1080, AV_PIX_FMT_BGRA, AV_PIX_FMT_YUV420P, "BGRA", "YUV420", NewImageBGRA, NewImageYUV420, Convert_BGRA_YUV420_Fallback , Convert_BGRA_YUV420_SSSE3 ); BenchmarkConvert(1920, 1080, AV_PIX_FMT_BGRA, AV_PIX_FMT_NV12 , "BGRA", "NV12 ", NewImageBGRA, NewImageNV12 , Convert_BGRA_NV12_Fallback , Convert_BGRA_NV12_SSSE3 ); BenchmarkConvert(1920, 1080, AV_PIX_FMT_BGRA, AV_PIX_FMT_BGR24 , "BGRA", "BGR ", NewImageBGRA, NewImageBGR , PlaneWrapper, PlaneWrapper); +#elif SSR_USE_LOONGARCH_ASM + BenchmarkConvert(1920, 1080, AV_PIX_FMT_BGRA, AV_PIX_FMT_YUV444P, "BGRA", "YUV444", NewImageBGRA, NewImageYUV444, Convert_BGRA_YUV444_Fallback , Convert_BGRA_YUV444_LSX ); + BenchmarkConvert(1920, 1080, AV_PIX_FMT_BGRA, AV_PIX_FMT_YUV422P, "BGRA", "YUV422", NewImageBGRA, NewImageYUV422, Convert_BGRA_YUV422_Fallback , Convert_BGRA_YUV422_LSX ); + BenchmarkConvert(1920, 1080, AV_PIX_FMT_BGRA, AV_PIX_FMT_YUV420P, "BGRA", "YUV420", NewImageBGRA, NewImageYUV420, Convert_BGRA_YUV420_Fallback , Convert_BGRA_YUV420_LSX ); + BenchmarkConvert(1920, 1080, AV_PIX_FMT_BGRA, AV_PIX_FMT_NV12 , "BGRA", "NV12 ", NewImageBGRA, NewImageNV12 , Convert_BGRA_NV12_Fallback , Convert_BGRA_NV12_LSX ); + BenchmarkConvert(1920, 1080, AV_PIX_FMT_BGRA, AV_PIX_FMT_BGR24 , "BGRA", "BGR ", NewImageBGRA, NewImageBGR , PlaneWrapper, PlaneWrapper); #else BenchmarkConvert(1920, 1080, AV_PIX_FMT_BGRA, AV_PIX_FMT_YUV444P, "BGRA", "YUV444", NewImageBGRA, NewImageYUV444, Convert_BGRA_YUV444_Fallback ); BenchmarkConvert(1920, 1080, AV_PIX_FMT_BGRA, AV_PIX_FMT_YUV422P, "BGRA", "YUV422", NewImageBGRA, NewImageYUV422, Convert_BGRA_YUV422_Fallback ); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e64d1748..e92a5165 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -32,6 +32,10 @@ else() find_package(Qt4 4.8 COMPONENTS QtGui REQUIRED) endif() +if(WITH_SIMDE) + find_package(SIMDE 0.8.3 REQUIRED) +endif() + set(sources AV/Input/ALSAInput.cpp AV/Input/ALSAInput.h @@ -164,6 +168,23 @@ if(ENABLE_X86_ASM) endif() +if(ENABLE_LOONGARCH_ASM) + + list(APPEND sources + AV/FastResampler_FirFilter_LSX.cpp + AV/FastScaler_Convert_LSX.cpp + AV/FastScaler_Scale_LSX.cpp + ) + + set_source_files_properties( + AV/FastResampler_FirFilter_LSX.cpp + 
AV/FastScaler_Convert_LSX.cpp
+		AV/FastScaler_Scale_LSX.cpp
+		PROPERTIES COMPILE_FLAGS -mlsx
+	)
+
+endif()
+
 set(res_input
 	../data/resources/resources.qrc
 )
@@ -235,6 +256,7 @@ target_link_libraries(simplescreenrecorder PRIVATE
 
 target_compile_definitions(simplescreenrecorder PRIVATE
 	-DSSR_USE_X86_ASM=$<BOOL:${ENABLE_X86_ASM}>
+	-DSSR_USE_LOONGARCH_ASM=$<AND:$<BOOL:${ENABLE_LOONGARCH_ASM}>,$<BOOL:${WITH_SIMDE}>>
 	-DSSR_USE_FFMPEG_VERSIONS=$<BOOL:${ENABLE_FFMPEG_VERSIONS}>
 	-DSSR_USE_JACK_METADATA=$<BOOL:${ENABLE_JACK_METADATA}>
 	-DSSR_USE_OPENGL_RECORDING=$<BOOL:${WITH_OPENGL_RECORDING}>
diff --git a/src/Main.cpp b/src/Main.cpp
index 4afeae20..436c7edf 100644
--- a/src/Main.cpp
+++ b/src/Main.cpp
@@ -99,7 +99,7 @@ int main(int argc, char* argv[]) {
 	Logger::LogInfo("==================== " + Logger::tr("SSR started") + " ====================");
 	Logger::LogInfo(GetVersionInfo());
 
-#if SSR_USE_X86_ASM
+#if SSR_USE_X86_ASM || SSR_USE_LOONGARCH_ASM
 	// detect CPU features
 	CPUFeatures::Detect();
 #endif
diff --git a/src/common/CPUFeatures.cpp b/src/common/CPUFeatures.cpp
index 04bfa1a8..2786aa0d 100644
--- a/src/common/CPUFeatures.cpp
+++ b/src/common/CPUFeatures.cpp
@@ -74,3 +74,28 @@ void CPUFeatures::Detect() {
 }
 
 #endif // SSR_USE_X86_ASM
+
+#if SSR_USE_LOONGARCH_ASM
+
+#include <sys/auxv.h>
+
+#define LA_HWCAP_LSX (1<<4)
+#define LA_HWCAP_LASX (1<<5)
+
+bool CPUFeatures::s_lsx = false;
+bool CPUFeatures::s_lasx = false;
+
+void CPUFeatures::Detect() {
+
+	QString str = "[CPUFeatures::Detect] " + Logger::tr("CPU features") + ":";
+
+	int flags = 0;
+	int flag = (int)getauxval(AT_HWCAP);
+
+	if (flag & LA_HWCAP_LSX) {s_lsx = true; str += " lsx";}
+	if (flag & LA_HWCAP_LASX) {s_lasx = true; str += " lasx";}
+
+	Logger::LogInfo(str);
+}
+
+#endif // SSR_USE_LOONGARCH_ASM
diff --git a/src/common/CPUFeatures.h b/src/common/CPUFeatures.h
index 728503cf..b05bcb1e 100644
--- a/src/common/CPUFeatures.h
+++ b/src/common/CPUFeatures.h
@@ -48,3 +48,20 @@ class CPUFeatures {
 };
 
 #endif // SSR_USE_X86_ASM
+
+#if SSR_USE_LOONGARCH_ASM
+
+class CPUFeatures {
+
+private:
+	static bool s_lsx, s_lasx;
+
+public:
+	static void Detect();
+
+	inline static bool HasLSX() { return s_lsx; }
+	inline static bool HasLASX() { return s_lasx; }
+
+};
+
+#endif // SSR_USE_LOONGARCH_ASM
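
Note (not part of the patch): the mechanism the new *_LSX.cpp files rely on is simde's native-alias mode. Defining SIMDE_ENABLE_NATIVE_ALIASES before including a simde header lets the existing _mm_* SSE/SSSE3 intrinsic names resolve to simde implementations, which are lowered to LSX instructions when the translation unit is compiled with -mlsx on LoongArch (and fall through to the native intrinsics on x86). A minimal standalone sketch of that pattern is shown below; the file name and build commands are illustrative only and are not taken from the patch.

// simde_lsx_example.cpp -- illustrative sketch, not part of SimpleScreenRecorder.
// The _mm_* names below are simde aliases: on LoongArch they are implemented
// with 128-bit LSX vectors, on x86 they map to the native SSE intrinsics.
#define SIMDE_ENABLE_NATIVE_ALIASES
#include <simde/x86/sse2.h>

#include <cstdio>

int main() {
	float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};
	float b[4] = {10.0f, 20.0f, 30.0f, 40.0f};
	float r[4];
	__m128 va = _mm_loadu_ps(a);    // unaligned 128-bit load of 4 floats
	__m128 vb = _mm_loadu_ps(b);
	__m128 vr = _mm_add_ps(va, vb); // four float additions in one vector op
	_mm_storeu_ps(r, vr);
	std::printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]);
	return 0;
}

// Possible build commands (illustrative):
//   LoongArch: g++ -O2 -mlsx simde_lsx_example.cpp -o simde_lsx_example
//   x86-64:    g++ -O2 simde_lsx_example.cpp -o simde_lsx_example

This is also why the patch compiles only the three *_LSX.cpp sources with COMPILE_FLAGS -mlsx in src/CMakeLists.txt instead of enabling LSX for the whole project.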