diff --git a/src/audio/SDL_audiocvt.c b/src/audio/SDL_audiocvt.c
index 36c631a2f..23d1906d7 100644
--- a/src/audio/SDL_audiocvt.c
+++ b/src/audio/SDL_audiocvt.c
@@ -52,6 +52,7 @@
 static void SDLCALL
 SDL_ConvertStereoToMono_SSE3(SDL_AudioCVT * cvt, SDL_AudioFormat format)
 {
+    const __m128 divby2 = _mm_set1_ps(0.5f);
     float *dst = (float *) cvt->buf;
     const float *src = dst;
     int i = cvt->len_cvt / 8;
@@ -59,15 +60,12 @@ SDL_ConvertStereoToMono_SSE3(SDL_AudioCVT * cvt, SDL_AudioFormat format)
     LOG_DEBUG_CONVERT("stereo", "mono (using SSE3)");
     SDL_assert(format == AUDIO_F32SYS);
 
-    /* We can only do this if dst is aligned to 16 bytes; since src is the
-       same pointer and it moves by 2, it can't be forcibly aligned. */
-    if ((((size_t) dst) & 15) == 0) {
-        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
-        const __m128 divby2 = _mm_set1_ps(0.5f);
-        while (i >= 4) {   /* 4 * float32 */
-            _mm_store_ps(dst, _mm_mul_ps(_mm_hadd_ps(_mm_load_ps(src), _mm_load_ps(src+4)), divby2));
-            i -= 4; src += 8; dst += 4;
-        }
+    /* Do SSE blocks as long as we have 4 sample frames left. Use unaligned
+       loads/stores for both src and dst (the buffer's alignment isn't checked);
+       on modern processors they're just as fast when memory is aligned. */
+    while (i >= 4) {   /* 4 * float32 */
+        _mm_storeu_ps(dst, _mm_mul_ps(_mm_hadd_ps(_mm_loadu_ps(src), _mm_loadu_ps(src+4)), divby2));
+        i -= 4; src += 8; dst += 4;
     }
 
     /* Finish off any leftovers with scalar operations. */