From e8c61af8384f33d3d9ef53983f35d6a6badc6cc3 Mon Sep 17 00:00:00 2001 From: Jay Sorg Date: Thu, 16 Oct 2014 20:22:01 -0700 Subject: [PATCH] Xorg: simd changes --- xorg/server/module/Makefile | 5 +- xorg/server/module/rdp.h | 7 + xorg/server/module/rdpCapture.c | 88 +++++---- xorg/server/module/rdpCapture.h | 6 + xorg/server/module/rdpSimd.c | 43 +---- xorg/server/module/rdpXv.c | 23 ++- .../x86/a8r8g8b8_to_a8b8g8r8_box_x86_sse2.asm | 174 ++++++++++++++++++ xorg/server/module/x86/funcs_x86.h | 4 + xorg/server/xrdpdev/xrdpdev.c | 4 +- 9 files changed, 275 insertions(+), 79 deletions(-) create mode 100644 xorg/server/module/x86/a8r8g8b8_to_a8b8g8r8_box_x86_sse2.asm diff --git a/xorg/server/module/Makefile b/xorg/server/module/Makefile index 8f1560d6..afcbc443 100644 --- a/xorg/server/module/Makefile +++ b/xorg/server/module/Makefile @@ -8,7 +8,7 @@ rdpCursor.o rdpMain.o rdpRandR.o rdpMisc.o rdpReg.o \ rdpComposite.o rdpGlyphs.o rdpPixmap.o rdpInput.o rdpClientCon.o rdpCapture.o \ rdpTrapezoids.o rdpXv.o rdpSimd.o -;OBJS += cpuid_x86.o i420_to_rgb32_x86_sse2.o yv12_to_rgb32_x86_sse2.o yuy2_to_rgb32_x86_sse2.o uyvy_to_rgb32_x86_sse2.o +;OBJS += cpuid_x86.o i420_to_rgb32_x86_sse2.o yv12_to_rgb32_x86_sse2.o yuy2_to_rgb32_x86_sse2.o uyvy_to_rgb32_x86_sse2.o a8r8g8b8_to_a8b8g8r8_box_x86_sse2.o ;OBJS += cpuid_amd64.o i420_to_rgb32_amd64_sse2.o yv12_to_rgb32_amd64_sse2.o yuy2_to_rgb32_amd64_sse2.o uyvy_to_rgb32_amd64_sse2.o CFLAGS = -g -O2 -Wall -fPIC -I/usr/include/xorg -I/usr/include/pixman-1 \ @@ -43,6 +43,9 @@ yuy2_to_rgb32_x86_sse2.o: x86/yuy2_to_rgb32_x86_sse2.asm uyvy_to_rgb32_x86_sse2.o: x86/uyvy_to_rgb32_x86_sse2.asm yasm -f elf32 -g dwarf2 x86/uyvy_to_rgb32_x86_sse2.asm +a8r8g8b8_to_a8b8g8r8_box_x86_sse2.o: x86/a8r8g8b8_to_a8b8g8r8_box_x86_sse2.asm + yasm -f elf32 -g dwarf2 x86/a8r8g8b8_to_a8b8g8r8_box_x86_sse2.asm + cpuid_amd64.o: amd64/cpuid_amd64.asm yasm -f elf64 -g dwarf2 amd64/cpuid_amd64.asm diff --git a/xorg/server/module/rdp.h b/xorg/server/module/rdp.h 
index 8a4d58c4..b39ebd3a 100644 --- a/xorg/server/module/rdp.h +++ b/xorg/server/module/rdp.h @@ -197,6 +197,10 @@ struct _rdpCounts typedef int (*yuv_to_rgb32_proc)(unsigned char *yuvs, int width, int height, int *rgbs); +typedef int (*copy_box_proc)(char *s8, int src_stride, + char *d8, int dst_stride, + int width, int height); + /* move this to common header */ struct _rdpRec { @@ -209,6 +213,7 @@ struct _rdpRec int bitsPerPixel; int Bpp; int Bpp_mask; + char *pfbMemory_alloc; char *pfbMemory; ScreenPtr pScreen; rdpDevPrivateKey privateKeyRecGC; @@ -277,6 +282,8 @@ struct _rdpRec int xv_timer_schedualed; OsTimerPtr xv_timer; + copy_box_proc a8r8g8b8_to_a8b8g8r8_box; + }; typedef struct _rdpRec rdpRec; typedef struct _rdpRec * rdpPtr; diff --git a/xorg/server/module/rdpCapture.c b/xorg/server/module/rdpCapture.c index 5163e6ae..d29c8608 100644 --- a/xorg/server/module/rdpCapture.c +++ b/xorg/server/module/rdpCapture.c @@ -63,7 +63,8 @@ rdpLimitRects(RegionPtr reg, int max_rects, BoxPtr *rects) /******************************************************************************/ /* copy rects with no error checking */ static int -rdpCopyBox_a8r8g8b8_to_a8r8g8b8(void *src, int src_stride, int srcx, int srcy, +rdpCopyBox_a8r8g8b8_to_a8r8g8b8(rdpClientCon *clientCon, + void *src, int src_stride, int srcx, int srcy, void *dst, int dst_stride, int dstx, int dsty, BoxPtr rects, int num_rects) { @@ -87,7 +88,7 @@ rdpCopyBox_a8r8g8b8_to_a8r8g8b8(void *src, int src_stride, int srcx, int srcy, height = box->y2 - box->y1; for (jndex = 0; jndex < height; jndex++) { - memcpy(d8, s8, bytes); + g_memcpy(d8, s8, bytes); d8 += dst_stride; s8 += src_stride; } @@ -101,7 +102,7 @@ rdpFillBox_yuvalp(int ax, int ay, void *dst, int dst_stride) { dst = ((char *) dst) + (ay << 8) * (dst_stride >> 8) + (ax << 8); - memset(dst, 0, 64 * 64 * 4); + g_memset(dst, 0, 64 * 64 * 4); return 0; } @@ -194,53 +195,67 @@ rdpCopyBox_a8r8g8b8_to_yuvalp(int ax, int ay, return 0; } 
+/******************************************************************************/ +int +a8r8g8b8_to_a8b8g8r8_box(char *s8, int src_stride, + char *d8, int dst_stride, + int width, int height) +{ + int index; + int jndex; + int red; + int green; + int blue; + unsigned int *s32; + unsigned int *d32; + + for (index = 0; index < height; index++) + { + s32 = (unsigned int *) s8; + d32 = (unsigned int *) d8; + for (jndex = 0; jndex < width; jndex++) + { + SPLITCOLOR32(red, green, blue, *s32); + *d32 = COLOR24(red, green, blue); + s32++; + d32++; + } + d8 += dst_stride; + s8 += src_stride; + } + return 0; +} + /******************************************************************************/ /* copy rects with no error checking */ static int -rdpCopyBox_a8r8g8b8_to_a8b8g8r8(void *src, int src_stride, - void *dst, int dst_stride, +rdpCopyBox_a8r8g8b8_to_a8b8g8r8(rdpClientCon *clientCon, + void *src, int src_stride, int srcx, int srcy, + void *dst, int dst_stride, int dstx, int dsty, BoxPtr rects, int num_rects) { char *s8; char *d8; int index; - int jndex; - int kndex; int bytes; int width; int height; - int red; - int green; - int blue; BoxPtr box; - unsigned int *s32; - unsigned int *d32; + copy_box_proc copy_box; + copy_box = clientCon->dev->a8r8g8b8_to_a8b8g8r8_box; for (index = 0; index < num_rects; index++) { box = rects + index; - s8 = ((char *) src) + box->y1 * src_stride; - s8 += box->x1 * 4; - d8 = ((char *) dst) + box->y1 * dst_stride; - d8 += box->x1 * 4; + s8 = ((char *) src) + (box->y1 - srcy) * src_stride; + s8 += (box->x1 - srcx) * 4; + d8 = ((char *) dst) + (box->y1 - dsty) * dst_stride; + d8 += (box->x1 - dstx) * 4; bytes = box->x2 - box->x1; bytes *= 4; width = box->x2 - box->x1; height = box->y2 - box->y1; - for (jndex = 0; jndex < height; jndex++) - { - s32 = (unsigned int *) s8; - d32 = (unsigned int *) d8; - for (kndex = 0; kndex < width; kndex++) - { - SPLITCOLOR32(red, green, blue, *s32); - *d32 = COLOR24(red, green, blue); - s32++; - d32++; - } - d8 
+= dst_stride; - s8 += src_stride; - } + copy_box(s8, src_stride, d8, dst_stride, width, height); } return 0; } @@ -283,8 +298,8 @@ rdpCapture0(rdpClientCon *clientCon, rect.x1 = 0; rect.y1 = 0; - rect.x2 = min(dst_width, src_width); - rect.y2 = min(dst_height, src_height); + rect.x2 = RDPMIN(dst_width, src_width); + rect.y2 = RDPMIN(dst_height, src_height); rdpRegionInit(&reg, &rect, 0); rdpRegionIntersect(&reg, in_reg, &reg); @@ -307,14 +322,16 @@ rdpCapture0(rdpClientCon *clientCon, if ((src_format == XRDP_a8r8g8b8) && (dst_format == XRDP_a8r8g8b8)) { - rdpCopyBox_a8r8g8b8_to_a8r8g8b8(src, src_stride, 0, 0, + rdpCopyBox_a8r8g8b8_to_a8r8g8b8(clientCon, + src, src_stride, 0, 0, dst, dst_stride, 0, 0, psrc_rects, num_rects); } else if ((src_format == XRDP_a8r8g8b8) && (dst_format == XRDP_a8b8g8r8)) { - rdpCopyBox_a8r8g8b8_to_a8b8g8r8(src, src_stride, - dst, dst_stride, + rdpCopyBox_a8r8g8b8_to_a8b8g8r8(clientCon, + src, src_stride, 0, 0, + dst, dst_stride, 0, 0, psrc_rects, num_rects); } else if ((src_format == XRDP_a8r8g8b8) && (dst_format == XRDP_r5g6b5)) @@ -739,6 +756,7 @@ rdpCapture(rdpClientCon *clientCon, int dst_stride, int dst_format, int mode) { LLOGLN(10, ("rdpCapture:")); + LLOGLN(10, ("rdpCapture: src %p dst %p", src, dst)); switch (mode) { case 0: diff --git a/xorg/server/module/rdpCapture.h b/xorg/server/module/rdpCapture.h index 4dff1eea..8b4d4615 100644 --- a/xorg/server/module/rdpCapture.h +++ b/xorg/server/module/rdpCapture.h @@ -25,3 +25,9 @@ rdpCapture(rdpClientCon *clientCon, int src_stride, int src_format, void *dst, int dst_width, int dst_height, int dst_stride, int dst_format, int mode); + +int +a8r8g8b8_to_a8b8g8r8_box(char *s8, int src_stride, + char *d8, int dst_stride, + int width, int height); + diff --git a/xorg/server/module/rdpSimd.c b/xorg/server/module/rdpSimd.c index 7215bf86..849ab7a2 100644 --- a/xorg/server/module/rdpSimd.c +++ b/xorg/server/module/rdpSimd.c @@ -35,6 +35,7 @@ SIMD function asign #include "rdp.h" #include "rdpXv.h" 
+#include "rdpCapture.h" /* use simd, run time */ int g_simd_use_accel = 1; @@ -65,6 +66,11 @@ rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn) dev = XRDPPTR(pScrn); /* assign functions */ LLOGLN(0, ("rdpSimdInit: assigning yuv functions")); + dev->yv12_to_rgb32 = YV12_to_RGB32; + dev->i420_to_rgb32 = I420_to_RGB32; + dev->yuy2_to_rgb32 = YUY2_to_RGB32; + dev->uyvy_to_rgb32 = UYVY_to_RGB32; + dev->a8r8g8b8_to_a8b8g8r8_box = a8r8g8b8_to_a8b8g8r8_box; #if SIMD_USE_ACCEL if (g_simd_use_accel) { @@ -81,14 +87,6 @@ rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn) dev->uyvy_to_rgb32 = uyvy_to_rgb32_amd64_sse2; LLOGLN(0, ("rdpSimdInit: sse2 amd64 yuv functions assigned")); } - else - { - dev->yv12_to_rgb32 = YV12_to_RGB32; - dev->i420_to_rgb32 = I420_to_RGB32; - dev->yuy2_to_rgb32 = YUY2_to_RGB32; - dev->uyvy_to_rgb32 = UYVY_to_RGB32; - LLOGLN(0, ("rdpSimdInit: warning, c yuv functions assigned")); - } #elif defined(__x86__) || defined(_M_IX86) || defined(__i386__) int ax, bx, cx, dx; cpuid_x86(1, 0, &ax, &bx, &cx, &dx); @@ -100,38 +98,11 @@ rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn) dev->i420_to_rgb32 = i420_to_rgb32_x86_sse2; dev->yuy2_to_rgb32 = yuy2_to_rgb32_x86_sse2; dev->uyvy_to_rgb32 = uyvy_to_rgb32_x86_sse2; + dev->a8r8g8b8_to_a8b8g8r8_box = a8r8g8b8_to_a8b8g8r8_box_x86_sse2; LLOGLN(0, ("rdpSimdInit: sse2 x86 yuv functions assigned")); } - else - { - dev->yv12_to_rgb32 = YV12_to_RGB32; - dev->i420_to_rgb32 = I420_to_RGB32; - dev->yuy2_to_rgb32 = YUY2_to_RGB32; - dev->uyvy_to_rgb32 = UYVY_to_RGB32; - LLOGLN(0, ("rdpSimdInit: warning, c yuv functions assigned")); - } -#else - dev->yv12_to_rgb32 = YV12_to_RGB32; - dev->i420_to_rgb32 = I420_to_RGB32; - dev->yuy2_to_rgb32 = YUY2_to_RGB32; - dev->uyvy_to_rgb32 = UYVY_to_RGB32; - LLOGLN(0, ("rdpSimdInit: warning, c yuv functions assigned")); #endif } - else - { - dev->yv12_to_rgb32 = YV12_to_RGB32; - dev->i420_to_rgb32 = I420_to_RGB32; - dev->yuy2_to_rgb32 = YUY2_to_RGB32; - dev->uyvy_to_rgb32 = UYVY_to_RGB32; - 
LLOGLN(0, ("rdpSimdInit: warning, c yuv functions assigned")); - } -#else - dev->yv12_to_rgb32 = YV12_to_RGB32; - dev->i420_to_rgb32 = I420_to_RGB32; - dev->yuy2_to_rgb32 = YUY2_to_RGB32; - dev->uyvy_to_rgb32 = UYVY_to_RGB32; - LLOGLN(0, ("rdpSimdInit: warning, c yuv functions assigned")); #endif return 1; } diff --git a/xorg/server/module/rdpXv.c b/xorg/server/module/rdpXv.c index 1557f892..c820e299 100644 --- a/xorg/server/module/rdpXv.c +++ b/xorg/server/module/rdpXv.c @@ -502,6 +502,7 @@ xrdpVidPutImage(ScrnInfoPtr pScrn, rgbend32 = rgborg32 + width * height; rgbend32 = (int *) RDPALIGN(rgbend32, 16); error = 0; + switch (format) { case FOURCC_YV12: @@ -528,12 +529,21 @@ xrdpVidPutImage(ScrnInfoPtr pScrn, { return Success; } - error = stretch_RGB32_RGB32(rgborg32, width, height, - src_x, src_y, src_w, src_h, - rgbend32, drw_w, drw_h); - if (error != 0) + if ((width == drw_w) && (height == drw_h)) { - return Success; + LLOGLN(10, ("xrdpVidPutImage: strech skip")); + rgbend32 = rgborg32; + } + else + { + error = stretch_RGB32_RGB32(rgborg32, width, height, + src_x, src_y, src_w, src_h, + rgbend32, drw_w, drw_h); + if (error != 0) + { + return Success; + } + } tempGC = GetScratchGC(dst->depth, pScrn->pScreen); @@ -542,7 +552,8 @@ xrdpVidPutImage(ScrnInfoPtr pScrn, ValidateGC(dst, tempGC); (*tempGC->ops->PutImage)(dst, tempGC, 24, drw_x - dst->x, drw_y - dst->y, - drw_w, drw_h, 0, ZPixmap, (char*)rgbend32); + drw_w, drw_h, 0, ZPixmap, + (char *) rgbend32); FreeScratchGC(tempGC); } diff --git a/xorg/server/module/x86/a8r8g8b8_to_a8b8g8r8_box_x86_sse2.asm b/xorg/server/module/x86/a8r8g8b8_to_a8b8g8r8_box_x86_sse2.asm new file mode 100644 index 00000000..72563214 --- /dev/null +++ b/xorg/server/module/x86/a8r8g8b8_to_a8b8g8r8_box_x86_sse2.asm @@ -0,0 +1,174 @@ +; +;Copyright 2014 Jay Sorg +; +;Permission to use, copy, modify, distribute, and sell this software and its +;documentation for any purpose is hereby granted without fee, provided that +;the above copyright 
notice appear in all copies and that both that +;copyright notice and this permission notice appear in supporting +;documentation. +; +;The above copyright notice and this permission notice shall be included in +;all copies or substantial portions of the Software. +; +;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN +;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +; +;ARGB to ABGR +;x86 SSE2 32 bit +; + +SECTION .data +align 16 +c1 times 4 dd 0xFF00FF00 +c2 times 4 dd 0x00FF0000 +c3 times 4 dd 0x000000FF + +SECTION .text + +%macro PROC 1 + align 16 + global %1 + %1: +%endmacro + +;int +;a8r8g8b8_to_a8b8g8r8_box_x86_sse2(char *s8, int src_stride, +; char *d8, int dst_stride, +; int width, int height); +PROC a8r8g8b8_to_a8b8g8r8_box_x86_sse2 + push ebx + push esi + push edi + push ebp + + movdqa xmm4, [c1] + movdqa xmm5, [c2] + movdqa xmm6, [c3] + + mov esi, [esp + 20] ; src + mov edi, [esp + 28] ; dst + +loop_y: + mov ecx, [esp + 36] ; width + +loop_xpre: + mov eax, esi ; look for aligned + and eax, 0x0F ; we can jump to next + mov ebx, eax + mov eax, edi + and eax, 0x0F + or eax, ebx + cmp eax, 0 + je done_loop_xpre + cmp ecx, 1 + jl done_loop_x ; all done with this row + mov eax, [esi] + lea esi, [esi + 4] + mov edx, eax ; a and g + and edx, 0xFF00FF00 + mov ebx, eax ; r + and ebx, 0x00FF0000 + shr ebx, 16 + or edx, ebx + mov ebx, eax ; b + and ebx, 0x000000FF + shl ebx, 16 + or edx, ebx + mov [edi], edx + lea edi, [edi + 4] + dec ecx + jmp loop_xpre; +done_loop_xpre: + + prefetchnta [esi] + +; A R G B A R G B A R G B A R G B to +; A B G R A B G R A B G R A B G R + +loop_x8: + cmp ecx, 8 + jl done_loop_x8 + + 
prefetchnta [esi + 32] + + movdqa xmm0, [esi] + lea esi, [esi + 16] + movdqa xmm3, xmm0 ; a and g + pand xmm3, xmm4 + movdqa xmm1, xmm0 ; r + pand xmm1, xmm5 + psrld xmm1, 16 + por xmm3, xmm1 + movdqa xmm1, xmm0 ; b + pand xmm1, xmm6 + pslld xmm1, 16 + por xmm3, xmm1 + movdqa [edi], xmm3 + lea edi, [edi + 16] + sub ecx, 4 + + movdqa xmm0, [esi] + lea esi, [esi + 16] + movdqa xmm3, xmm0 ; a and g + pand xmm3, xmm4 + movdqa xmm1, xmm0 ; r + pand xmm1, xmm5 + psrld xmm1, 16 + por xmm3, xmm1 + movdqa xmm1, xmm0 ; b + pand xmm1, xmm6 + pslld xmm1, 16 + por xmm3, xmm1 + movdqa [edi], xmm3 + lea edi, [edi + 16] + sub ecx, 4 + + jmp loop_x8; +done_loop_x8: + +loop_x: + cmp ecx, 1 + jl done_loop_x + mov eax, [esi] + lea esi, [esi + 4] + mov edx, eax ; a and g + and edx, 0xFF00FF00 + mov ebx, eax ; r + and ebx, 0x00FF0000 + shr ebx, 16 + or edx, ebx + mov ebx, eax ; b + and ebx, 0x000000FF + shl ebx, 16 + or edx, ebx + mov [edi], edx + lea edi, [edi + 4] + dec ecx + jmp loop_x; +done_loop_x: + + mov esi, [esp + 20] + add esi, [esp + 24] + mov [esp + 20], esi + + mov edi, [esp + 28] + add edi, [esp + 32] + mov [esp + 28], edi + + mov ecx, [esp + 40] ; height + dec ecx + mov [esp + 40], ecx + jnz loop_y + + mov eax, 0 ; return value + pop ebp + pop edi + pop esi + pop ebx + ret + align 16 + diff --git a/xorg/server/module/x86/funcs_x86.h b/xorg/server/module/x86/funcs_x86.h index 00724e62..775dd12d 100644 --- a/xorg/server/module/x86/funcs_x86.h +++ b/xorg/server/module/x86/funcs_x86.h @@ -34,6 +34,10 @@ int yuy2_to_rgb32_x86_sse2(unsigned char *yuvs, int width, int height, int *rgbs); int uyvy_to_rgb32_x86_sse2(unsigned char *yuvs, int width, int height, int *rgbs); +int +a8r8g8b8_to_a8b8g8r8_box_x86_sse2(char *s8, int src_stride, + char *d8, int dst_stride, + int width, int height); #endif diff --git a/xorg/server/xrdpdev/xrdpdev.c b/xorg/server/xrdpdev/xrdpdev.c index 9aa4ca2a..034e2fb4 100644 --- a/xorg/server/xrdpdev/xrdpdev.c +++ b/xorg/server/xrdpdev/xrdpdev.c @@ -436,7 
+436,9 @@ rdpScreenInit(ScreenPtr pScreen, int argc, char **argv) dev->bitsPerPixel = rdpBitsPerPixel(dev->depth); dev->sizeInBytes = dev->paddedWidthInBytes * dev->height; LLOGLN(0, ("rdpScreenInit: pfbMemory bytes %d", dev->sizeInBytes)); - dev->pfbMemory = (char *) g_malloc(dev->sizeInBytes, 1); + dev->pfbMemory_alloc = (char *) g_malloc(dev->sizeInBytes + 16, 1); + dev->pfbMemory = (char*) RDPALIGN(dev->pfbMemory_alloc, 16); + LLOGLN(0, ("rdpScreenInit: pfbMemory %p", dev->pfbMemory)); if (!fbScreenInit(pScreen, dev->pfbMemory, pScrn->virtualX, pScrn->virtualY, pScrn->xDpi, pScrn->yDpi, pScrn->displayWidth,