mirror of
https://github.com/Relintai/sdl2_frt.git
synced 2024-11-21 20:47:19 +01:00
audiocvt: stereo-to-mono SSE3 now uses unaligned accesses.
On modern CPUs, there's no penalty for using the unaligned load/store instructions on aligned memory, and with this change the SSE3 path can vectorize unaligned data too; even if that isn't optimal, it is still going to be faster than the scalar fallback. Fixes #4532.
This commit is contained in:
parent
a894ce4c14
commit
8d790b10f8
@ -52,6 +52,7 @@
|
|||||||
static void SDLCALL
|
static void SDLCALL
|
||||||
SDL_ConvertStereoToMono_SSE3(SDL_AudioCVT * cvt, SDL_AudioFormat format)
|
SDL_ConvertStereoToMono_SSE3(SDL_AudioCVT * cvt, SDL_AudioFormat format)
|
||||||
{
|
{
|
||||||
|
const __m128 divby2 = _mm_set1_ps(0.5f);
|
||||||
float *dst = (float *) cvt->buf;
|
float *dst = (float *) cvt->buf;
|
||||||
const float *src = dst;
|
const float *src = dst;
|
||||||
int i = cvt->len_cvt / 8;
|
int i = cvt->len_cvt / 8;
|
||||||
@ -59,15 +60,12 @@ SDL_ConvertStereoToMono_SSE3(SDL_AudioCVT * cvt, SDL_AudioFormat format)
|
|||||||
LOG_DEBUG_CONVERT("stereo", "mono (using SSE3)");
|
LOG_DEBUG_CONVERT("stereo", "mono (using SSE3)");
|
||||||
SDL_assert(format == AUDIO_F32SYS);
|
SDL_assert(format == AUDIO_F32SYS);
|
||||||
|
|
||||||
/* We can only do this if dst is aligned to 16 bytes; since src is the
|
/* Do SSE blocks as long as we have 16 bytes available.
|
||||||
same pointer and it moves by 2, it can't be forcibly aligned. */
|
Just use unaligned load/stores, if the memory at runtime is
|
||||||
if ((((size_t) dst) & 15) == 0) {
|
aligned it'll be just as fast on modern processors */
|
||||||
/* Aligned! Do SSE blocks as long as we have 16 bytes available. */
|
while (i >= 4) { /* 4 * float32 */
|
||||||
const __m128 divby2 = _mm_set1_ps(0.5f);
|
_mm_storeu_ps(dst, _mm_mul_ps(_mm_hadd_ps(_mm_load_ps(src), _mm_loadu_ps(src+4)), divby2));
|
||||||
while (i >= 4) { /* 4 * float32 */
|
i -= 4; src += 8; dst += 4;
|
||||||
_mm_store_ps(dst, _mm_mul_ps(_mm_hadd_ps(_mm_load_ps(src), _mm_load_ps(src+4)), divby2));
|
|
||||||
i -= 4; src += 8; dst += 4;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Finish off any leftovers with scalar operations. */
|
/* Finish off any leftovers with scalar operations. */
|
||||||
|
Loading…
Reference in New Issue
Block a user