Xorg: simd changes

This commit is contained in:
Jay Sorg 2014-10-16 20:22:01 -07:00
parent 1de9164c0b
commit e8c61af838
9 changed files with 275 additions and 79 deletions

View File

@ -8,7 +8,7 @@ rdpCursor.o rdpMain.o rdpRandR.o rdpMisc.o rdpReg.o \
rdpComposite.o rdpGlyphs.o rdpPixmap.o rdpInput.o rdpClientCon.o rdpCapture.o \
rdpTrapezoids.o rdpXv.o rdpSimd.o
;OBJS += cpuid_x86.o i420_to_rgb32_x86_sse2.o yv12_to_rgb32_x86_sse2.o yuy2_to_rgb32_x86_sse2.o uyvy_to_rgb32_x86_sse2.o
;OBJS += cpuid_x86.o i420_to_rgb32_x86_sse2.o yv12_to_rgb32_x86_sse2.o yuy2_to_rgb32_x86_sse2.o uyvy_to_rgb32_x86_sse2.o a8r8g8b8_to_a8b8g8r8_box_x86_sse2.o
;OBJS += cpuid_amd64.o i420_to_rgb32_amd64_sse2.o yv12_to_rgb32_amd64_sse2.o yuy2_to_rgb32_amd64_sse2.o uyvy_to_rgb32_amd64_sse2.o
CFLAGS = -g -O2 -Wall -fPIC -I/usr/include/xorg -I/usr/include/pixman-1 \
@ -43,6 +43,9 @@ yuy2_to_rgb32_x86_sse2.o: x86/yuy2_to_rgb32_x86_sse2.asm
uyvy_to_rgb32_x86_sse2.o: x86/uyvy_to_rgb32_x86_sse2.asm
yasm -f elf32 -g dwarf2 x86/uyvy_to_rgb32_x86_sse2.asm
a8r8g8b8_to_a8b8g8r8_box_x86_sse2.o: x86/a8r8g8b8_to_a8b8g8r8_box_x86_sse2.asm
yasm -f elf32 -g dwarf2 x86/a8r8g8b8_to_a8b8g8r8_box_x86_sse2.asm
cpuid_amd64.o: amd64/cpuid_amd64.asm
yasm -f elf64 -g dwarf2 amd64/cpuid_amd64.asm

View File

@ -197,6 +197,10 @@ struct _rdpCounts
typedef int (*yuv_to_rgb32_proc)(unsigned char *yuvs, int width, int height, int *rgbs);
/* Function pointer type for a routine that processes a width x height box,
   given a source pointer and stride and a destination pointer and stride.
   struct _rdpRec stores one in its a8r8g8b8_to_a8b8g8r8_box member. */
typedef int (*copy_box_proc)(char *s8, int src_stride,
char *d8, int dst_stride,
int width, int height);
/* move this to common header */
struct _rdpRec
{
@ -209,6 +213,7 @@ struct _rdpRec
int bitsPerPixel;
int Bpp;
int Bpp_mask;
char *pfbMemory_alloc;
char *pfbMemory;
ScreenPtr pScreen;
rdpDevPrivateKey privateKeyRecGC;
@ -277,6 +282,8 @@ struct _rdpRec
int xv_timer_schedualed;
OsTimerPtr xv_timer;
copy_box_proc a8r8g8b8_to_a8b8g8r8_box;
};
typedef struct _rdpRec rdpRec;
typedef struct _rdpRec * rdpPtr;

View File

@ -63,7 +63,8 @@ rdpLimitRects(RegionPtr reg, int max_rects, BoxPtr *rects)
/******************************************************************************/
/* copy rects with no error checking */
static int
rdpCopyBox_a8r8g8b8_to_a8r8g8b8(void *src, int src_stride, int srcx, int srcy,
rdpCopyBox_a8r8g8b8_to_a8r8g8b8(rdpClientCon *clientCon,
void *src, int src_stride, int srcx, int srcy,
void *dst, int dst_stride, int dstx, int dsty,
BoxPtr rects, int num_rects)
{
@ -87,7 +88,7 @@ rdpCopyBox_a8r8g8b8_to_a8r8g8b8(void *src, int src_stride, int srcx, int srcy,
height = box->y2 - box->y1;
for (jndex = 0; jndex < height; jndex++)
{
memcpy(d8, s8, bytes);
g_memcpy(d8, s8, bytes);
d8 += dst_stride;
s8 += src_stride;
}
@ -101,7 +102,7 @@ rdpFillBox_yuvalp(int ax, int ay,
void *dst, int dst_stride)
{
dst = ((char *) dst) + (ay << 8) * (dst_stride >> 8) + (ax << 8);
memset(dst, 0, 64 * 64 * 4);
g_memset(dst, 0, 64 * 64 * 4);
return 0;
}
@ -194,53 +195,67 @@ rdpCopyBox_a8r8g8b8_to_yuvalp(int ax, int ay,
return 0;
}
/******************************************************************************/
/* Convert a width x height box of 32 bit pixels from s8 to d8.
   Each source pixel is split with SPLITCOLOR32 and repacked with COLOR24
   (both macros are defined elsewhere; per the function name this turns
   a8r8g8b8 into a8b8g8r8).  src_stride and dst_stride are the byte
   distances between consecutive rows.  Always returns 0. */
int
a8r8g8b8_to_a8b8g8r8_box(char *s8, int src_stride,
                         char *d8, int dst_stride,
                         int width, int height)
{
    const unsigned int *src_row;
    unsigned int *dst_row;
    int row;
    int col;
    int red;
    int green;
    int blue;

    for (row = 0; row < height; row++)
    {
        /* view the current row of each buffer as 32 bit pixels */
        src_row = (const unsigned int *) s8;
        dst_row = (unsigned int *) d8;
        for (col = 0; col < width; col++)
        {
            SPLITCOLOR32(red, green, blue, src_row[col]);
            dst_row[col] = COLOR24(red, green, blue);
        }
        /* step both buffers to the next row */
        s8 += src_stride;
        d8 += dst_stride;
    }
    return 0;
}
/******************************************************************************/
/* copy rects with no error checking */
static int
rdpCopyBox_a8r8g8b8_to_a8b8g8r8(void *src, int src_stride,
void *dst, int dst_stride,
rdpCopyBox_a8r8g8b8_to_a8b8g8r8(rdpClientCon *clientCon,
void *src, int src_stride, int srcx, int srcy,
void *dst, int dst_stride, int dstx, int dsty,
BoxPtr rects, int num_rects)
{
char *s8;
char *d8;
int index;
int jndex;
int kndex;
int bytes;
int width;
int height;
int red;
int green;
int blue;
BoxPtr box;
unsigned int *s32;
unsigned int *d32;
copy_box_proc copy_box;
copy_box = clientCon->dev->a8r8g8b8_to_a8b8g8r8_box;
for (index = 0; index < num_rects; index++)
{
box = rects + index;
s8 = ((char *) src) + box->y1 * src_stride;
s8 += box->x1 * 4;
d8 = ((char *) dst) + box->y1 * dst_stride;
d8 += box->x1 * 4;
s8 = ((char *) src) + (box->y1 - srcy) * src_stride;
s8 += (box->x1 - srcx) * 4;
d8 = ((char *) dst) + (box->y1 - dsty) * dst_stride;
d8 += (box->x1 - dstx) * 4;
bytes = box->x2 - box->x1;
bytes *= 4;
width = box->x2 - box->x1;
height = box->y2 - box->y1;
for (jndex = 0; jndex < height; jndex++)
{
s32 = (unsigned int *) s8;
d32 = (unsigned int *) d8;
for (kndex = 0; kndex < width; kndex++)
{
SPLITCOLOR32(red, green, blue, *s32);
*d32 = COLOR24(red, green, blue);
s32++;
d32++;
}
d8 += dst_stride;
s8 += src_stride;
}
copy_box(s8, src_stride, d8, dst_stride, width, height);
}
return 0;
}
@ -283,8 +298,8 @@ rdpCapture0(rdpClientCon *clientCon,
rect.x1 = 0;
rect.y1 = 0;
rect.x2 = min(dst_width, src_width);
rect.y2 = min(dst_height, src_height);
rect.x2 = RDPMIN(dst_width, src_width);
rect.y2 = RDPMIN(dst_height, src_height);
rdpRegionInit(&reg, &rect, 0);
rdpRegionIntersect(&reg, in_reg, &reg);
@ -307,14 +322,16 @@ rdpCapture0(rdpClientCon *clientCon,
if ((src_format == XRDP_a8r8g8b8) && (dst_format == XRDP_a8r8g8b8))
{
rdpCopyBox_a8r8g8b8_to_a8r8g8b8(src, src_stride, 0, 0,
rdpCopyBox_a8r8g8b8_to_a8r8g8b8(clientCon,
src, src_stride, 0, 0,
dst, dst_stride, 0, 0,
psrc_rects, num_rects);
}
else if ((src_format == XRDP_a8r8g8b8) && (dst_format == XRDP_a8b8g8r8))
{
rdpCopyBox_a8r8g8b8_to_a8b8g8r8(src, src_stride,
dst, dst_stride,
rdpCopyBox_a8r8g8b8_to_a8b8g8r8(clientCon,
src, src_stride, 0, 0,
dst, dst_stride, 0, 0,
psrc_rects, num_rects);
}
else if ((src_format == XRDP_a8r8g8b8) && (dst_format == XRDP_r5g6b5))
@ -739,6 +756,7 @@ rdpCapture(rdpClientCon *clientCon,
int dst_stride, int dst_format, int mode)
{
LLOGLN(10, ("rdpCapture:"));
LLOGLN(10, ("rdpCapture: src %p dst %p", src, dst));
switch (mode)
{
case 0:

View File

@ -25,3 +25,9 @@ rdpCapture(rdpClientCon *clientCon,
int src_stride, int src_format,
void *dst, int dst_width, int dst_height,
int dst_stride, int dst_format, int mode);
/* C implementation of the a8r8g8b8 to a8b8g8r8 box conversion.
   rdpSimdInit assigns it to dev->a8r8g8b8_to_a8b8g8r8_box before any
   accelerated version is selected. */
int
a8r8g8b8_to_a8b8g8r8_box(char *s8, int src_stride,
char *d8, int dst_stride,
int width, int height);

View File

@ -35,6 +35,7 @@ SIMD function asign
#include "rdp.h"
#include "rdpXv.h"
#include "rdpCapture.h"
/* use simd, run time */
int g_simd_use_accel = 1;
@ -65,6 +66,11 @@ rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn)
dev = XRDPPTR(pScrn);
/* assign functions */
LLOGLN(0, ("rdpSimdInit: assigning yuv functions"));
dev->yv12_to_rgb32 = YV12_to_RGB32;
dev->i420_to_rgb32 = I420_to_RGB32;
dev->yuy2_to_rgb32 = YUY2_to_RGB32;
dev->uyvy_to_rgb32 = UYVY_to_RGB32;
dev->a8r8g8b8_to_a8b8g8r8_box = a8r8g8b8_to_a8b8g8r8_box;
#if SIMD_USE_ACCEL
if (g_simd_use_accel)
{
@ -81,14 +87,6 @@ rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn)
dev->uyvy_to_rgb32 = uyvy_to_rgb32_amd64_sse2;
LLOGLN(0, ("rdpSimdInit: sse2 amd64 yuv functions assigned"));
}
else
{
dev->yv12_to_rgb32 = YV12_to_RGB32;
dev->i420_to_rgb32 = I420_to_RGB32;
dev->yuy2_to_rgb32 = YUY2_to_RGB32;
dev->uyvy_to_rgb32 = UYVY_to_RGB32;
LLOGLN(0, ("rdpSimdInit: warning, c yuv functions assigned"));
}
#elif defined(__x86__) || defined(_M_IX86) || defined(__i386__)
int ax, bx, cx, dx;
cpuid_x86(1, 0, &ax, &bx, &cx, &dx);
@ -100,38 +98,11 @@ rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn)
dev->i420_to_rgb32 = i420_to_rgb32_x86_sse2;
dev->yuy2_to_rgb32 = yuy2_to_rgb32_x86_sse2;
dev->uyvy_to_rgb32 = uyvy_to_rgb32_x86_sse2;
dev->a8r8g8b8_to_a8b8g8r8_box = a8r8g8b8_to_a8b8g8r8_box_x86_sse2;
LLOGLN(0, ("rdpSimdInit: sse2 x86 yuv functions assigned"));
}
else
{
dev->yv12_to_rgb32 = YV12_to_RGB32;
dev->i420_to_rgb32 = I420_to_RGB32;
dev->yuy2_to_rgb32 = YUY2_to_RGB32;
dev->uyvy_to_rgb32 = UYVY_to_RGB32;
LLOGLN(0, ("rdpSimdInit: warning, c yuv functions assigned"));
}
#else
dev->yv12_to_rgb32 = YV12_to_RGB32;
dev->i420_to_rgb32 = I420_to_RGB32;
dev->yuy2_to_rgb32 = YUY2_to_RGB32;
dev->uyvy_to_rgb32 = UYVY_to_RGB32;
LLOGLN(0, ("rdpSimdInit: warning, c yuv functions assigned"));
#endif
}
else
{
dev->yv12_to_rgb32 = YV12_to_RGB32;
dev->i420_to_rgb32 = I420_to_RGB32;
dev->yuy2_to_rgb32 = YUY2_to_RGB32;
dev->uyvy_to_rgb32 = UYVY_to_RGB32;
LLOGLN(0, ("rdpSimdInit: warning, c yuv functions assigned"));
}
#else
dev->yv12_to_rgb32 = YV12_to_RGB32;
dev->i420_to_rgb32 = I420_to_RGB32;
dev->yuy2_to_rgb32 = YUY2_to_RGB32;
dev->uyvy_to_rgb32 = UYVY_to_RGB32;
LLOGLN(0, ("rdpSimdInit: warning, c yuv functions assigned"));
#endif
return 1;
}

View File

@ -502,6 +502,7 @@ xrdpVidPutImage(ScrnInfoPtr pScrn,
rgbend32 = rgborg32 + width * height;
rgbend32 = (int *) RDPALIGN(rgbend32, 16);
error = 0;
switch (format)
{
case FOURCC_YV12:
@ -528,12 +529,21 @@ xrdpVidPutImage(ScrnInfoPtr pScrn,
{
return Success;
}
error = stretch_RGB32_RGB32(rgborg32, width, height,
src_x, src_y, src_w, src_h,
rgbend32, drw_w, drw_h);
if (error != 0)
if ((width == drw_w) && (height == drw_h))
{
return Success;
LLOGLN(10, ("xrdpVidPutImage: strech skip"));
rgbend32 = rgborg32;
}
else
{
error = stretch_RGB32_RGB32(rgborg32, width, height,
src_x, src_y, src_w, src_h,
rgbend32, drw_w, drw_h);
if (error != 0)
{
return Success;
}
}
tempGC = GetScratchGC(dst->depth, pScrn->pScreen);
@ -542,7 +552,8 @@ xrdpVidPutImage(ScrnInfoPtr pScrn,
ValidateGC(dst, tempGC);
(*tempGC->ops->PutImage)(dst, tempGC, 24,
drw_x - dst->x, drw_y - dst->y,
drw_w, drw_h, 0, ZPixmap, (char*)rgbend32);
drw_w, drw_h, 0, ZPixmap,
(char *) rgbend32);
FreeScratchGC(tempGC);
}

View File

@ -0,0 +1,174 @@
;
;Copyright 2014 Jay Sorg
;
;Permission to use, copy, modify, distribute, and sell this software and its
;documentation for any purpose is hereby granted without fee, provided that
;the above copyright notice appear in all copies and that both that
;copyright notice and this permission notice appear in supporting
;documentation.
;
;The above copyright notice and this permission notice shall be included in
;all copies or substantial portions of the Software.
;
;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
;
;ARGB to ABGR
;x86 SSE2 32 bit
;
; Per-pixel channel masks, each repeated in four dwords so one 16 byte
; register covers four pixels.  They are 16 byte aligned because the code
; loads them with movdqa.
;   c1: bytes that stay in place (the code labels these "a and g")
;   c2: byte that is shifted right by 16 (labelled "r")
;   c3: byte that is shifted left by 16 (labelled "b")
SECTION .data
align 16
c1 times 4 dd 0xFF00FF00
c2 times 4 dd 0x00FF0000
c3 times 4 dd 0x000000FF
SECTION .text
; PROC name
; Starts a function: aligns to 16 bytes, exports the name as a global
; symbol and defines the label.
%macro PROC 1
align 16
global %1
%1:
%endmacro
;int
;a8r8g8b8_to_a8b8g8r8_box_x86_sse2(char *s8, int src_stride,
; char *d8, int dst_stride,
; int width, int height);
; Converts a box of 32 bit pixels row by row.  In every pixel the byte
; selected by 0x00FF0000 and the byte selected by 0x000000FF trade places;
; the bytes selected by 0xFF00FF00 are copied unchanged.
;
; Stack arguments (offsets are valid after the four pushes below):
;   [esp + 20] s8  (source pointer)       [esp + 24] src_stride (bytes)
;   [esp + 28] d8  (destination pointer)  [esp + 32] dst_stride (bytes)
;   [esp + 36] width (pixels)             [esp + 40] height (rows)
; The source, destination and height slots are overwritten as rows are
; consumed.  Returns 0 in eax.
;
; NOTE(review): height is only tested after a row has been processed
; (dec / jnz at the bottom), so height <= 0 is not guarded here - confirm
; callers always pass a positive height.
PROC a8r8g8b8_to_a8b8g8r8_box_x86_sse2
push ebx
push esi
push edi
; ebp is saved and restored but not otherwise used in this routine
push ebp
; load the three channel masks once; xmm4-xmm6 are not modified afterwards
movdqa xmm4, [c1]
movdqa xmm5, [c2]
movdqa xmm6, [c3]
mov esi, [esp + 20] ; src
mov edi, [esp + 28] ; dst
; one iteration of loop_y per row; ecx counts the pixels left in the row
loop_y:
mov ecx, [esp + 36] ; width
; scalar pre-loop: convert one pixel at a time until both esi and edi are
; 16 byte aligned (the movdqa loads/stores below need that) or the row ends
loop_xpre:
mov eax, esi ; look for aligned
and eax, 0x0F ; we can jump to next
mov ebx, eax
mov eax, edi
and eax, 0x0F
; eax is zero only when the low four bits of both pointers are zero
or eax, ebx
cmp eax, 0
je done_loop_xpre
cmp ecx, 1
jl done_loop_x ; all done with this row
mov eax, [esi]
lea esi, [esi + 4]
mov edx, eax ; a and g
and edx, 0xFF00FF00
mov ebx, eax ; r
and ebx, 0x00FF0000
shr ebx, 16
or edx, ebx
mov ebx, eax ; b
and ebx, 0x000000FF
shl ebx, 16
or edx, ebx
mov [edi], edx
lea edi, [edi + 4]
dec ecx
jmp loop_xpre;
done_loop_xpre:
prefetchnta [esi]
; A R G B A R G B A R G B A R G B to
; A B G R A B G R A B G R A B G R
; vector loop: while at least 8 pixels remain, convert two groups of
; 4 pixels (16 bytes each) per iteration
loop_x8:
cmp ecx, 8
jl done_loop_x8
prefetchnta [esi + 32]
; first group of 4 pixels
movdqa xmm0, [esi]
lea esi, [esi + 16]
movdqa xmm3, xmm0 ; a and g
pand xmm3, xmm4
movdqa xmm1, xmm0 ; r
pand xmm1, xmm5
psrld xmm1, 16
por xmm3, xmm1
movdqa xmm1, xmm0 ; b
pand xmm1, xmm6
pslld xmm1, 16
por xmm3, xmm1
movdqa [edi], xmm3
lea edi, [edi + 16]
sub ecx, 4
; second group of 4 pixels
movdqa xmm0, [esi]
lea esi, [esi + 16]
movdqa xmm3, xmm0 ; a and g
pand xmm3, xmm4
movdqa xmm1, xmm0 ; r
pand xmm1, xmm5
psrld xmm1, 16
por xmm3, xmm1
movdqa xmm1, xmm0 ; b
pand xmm1, xmm6
pslld xmm1, 16
por xmm3, xmm1
movdqa [edi], xmm3
lea edi, [edi + 16]
sub ecx, 4
jmp loop_x8;
done_loop_x8:
; scalar tail: convert the remaining (fewer than 8) pixels one at a time
loop_x:
cmp ecx, 1
jl done_loop_x
mov eax, [esi]
lea esi, [esi + 4]
mov edx, eax ; a and g
and edx, 0xFF00FF00
mov ebx, eax ; r
and ebx, 0x00FF0000
shr ebx, 16
or edx, ebx
mov ebx, eax ; b
and ebx, 0x000000FF
shl ebx, 16
or edx, ebx
mov [edi], edx
lea edi, [edi + 4]
dec ecx
jmp loop_x;
; end of row: advance the saved row pointers by their strides and store
; them back into the argument slots so the next row starts from there
done_loop_x:
mov esi, [esp + 20]
add esi, [esp + 24]
mov [esp + 20], esi
mov edi, [esp + 28]
add edi, [esp + 32]
mov [esp + 28], edi
mov ecx, [esp + 40] ; height
dec ecx
; the mov below does not change flags, so jnz still tests the dec result
mov [esp + 40], ecx
jnz loop_y
mov eax, 0 ; return value
pop ebp
pop edi
pop esi
pop ebx
ret
align 16

View File

@ -34,6 +34,10 @@ int
yuy2_to_rgb32_x86_sse2(unsigned char *yuvs, int width, int height, int *rgbs);
int
uyvy_to_rgb32_x86_sse2(unsigned char *yuvs, int width, int height, int *rgbs);
/* x86 (32 bit) SSE2 assembly version of the a8r8g8b8 to a8b8g8r8 box
   conversion; it takes the same parameter list as copy_box_proc. */
int
a8r8g8b8_to_a8b8g8r8_box_x86_sse2(char *s8, int src_stride,
char *d8, int dst_stride,
int width, int height);
#endif

View File

@ -436,7 +436,9 @@ rdpScreenInit(ScreenPtr pScreen, int argc, char **argv)
dev->bitsPerPixel = rdpBitsPerPixel(dev->depth);
dev->sizeInBytes = dev->paddedWidthInBytes * dev->height;
LLOGLN(0, ("rdpScreenInit: pfbMemory bytes %d", dev->sizeInBytes));
dev->pfbMemory = (char *) g_malloc(dev->sizeInBytes, 1);
dev->pfbMemory_alloc = (char *) g_malloc(dev->sizeInBytes + 16, 1);
dev->pfbMemory = (char*) RDPALIGN(dev->pfbMemory_alloc, 16);
LLOGLN(0, ("rdpScreenInit: pfbMemory %p", dev->pfbMemory));
if (!fbScreenInit(pScreen, dev->pfbMemory,
pScrn->virtualX, pScrn->virtualY,
pScrn->xDpi, pScrn->yDpi, pScrn->displayWidth,