#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH "prefetch"
#define PAVGB    "pavgusb"
#elif COMPILE_TEMPLATE_MMXEXT
#define PREFETCH "prefetchnta"
#define PAVGB    "pavgb"
#else
#define PREFETCH " # nop"
#endif

#if COMPILE_TEMPLATE_AMD3DNOW
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#if COMPILE_TEMPLATE_MMXEXT
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif

#if !COMPILE_TEMPLATE_SSE2

#if !COMPILE_TEMPLATE_AMD3DNOW
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
77 "movd (%1), %%mm0 \n\t"
78 "punpckldq 3(%1), %%mm0 \n\t"
79 "movd 6(%1), %%mm1 \n\t"
80 "punpckldq 9(%1), %%mm1 \n\t"
81 "movd 12(%1), %%mm2 \n\t"
82 "punpckldq 15(%1), %%mm2 \n\t"
83 "movd 18(%1), %%mm3 \n\t"
84 "punpckldq 21(%1), %%mm3 \n\t"
85 "por %%mm7, %%mm0 \n\t"
86 "por %%mm7, %%mm1 \n\t"
87 "por %%mm7, %%mm2 \n\t"
88 "por %%mm7, %%mm3 \n\t"
91 MOVNTQ" %%mm2, 16(%0) \n\t"
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#define STORE_BGR24_MMX \
    "psrlq $8, %%mm2    \n\t" \
    "psrlq $8, %%mm3    \n\t" \
    "psrlq $8, %%mm6    \n\t" \
    "psrlq $8, %%mm7    \n\t" \
    "pand "MANGLE(mask24l)", %%mm0\n\t" \
    "pand "MANGLE(mask24l)", %%mm1\n\t" \
    "pand "MANGLE(mask24l)", %%mm4\n\t" \
    "pand "MANGLE(mask24l)", %%mm5\n\t" \
    "pand "MANGLE(mask24h)", %%mm2\n\t" \
    "pand "MANGLE(mask24h)", %%mm3\n\t" \
    "pand "MANGLE(mask24h)", %%mm6\n\t" \
    "pand "MANGLE(mask24h)", %%mm7\n\t" \
    "por %%mm2, %%mm0   \n\t" \
    "por %%mm3, %%mm1   \n\t" \
    "por %%mm6, %%mm4   \n\t" \
    "por %%mm7, %%mm5   \n\t" \
\
    "movq %%mm1, %%mm2  \n\t" \
    "movq %%mm4, %%mm3  \n\t" \
    "psllq $48, %%mm2   \n\t" \
    "psllq $32, %%mm3   \n\t" \
    "pand "MANGLE(mask24hh)", %%mm2\n\t" \
    "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
    "por %%mm2, %%mm0   \n\t" \
    "psrlq $16, %%mm1   \n\t" \
    "psrlq $32, %%mm4   \n\t" \
    "psllq $16, %%mm5   \n\t" \
    "por %%mm3, %%mm1   \n\t" \
    "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \
    "por %%mm5, %%mm4   \n\t" \
\
    MOVNTQ" %%mm0,   (%0) \n\t" \
    MOVNTQ" %%mm1,  8(%0) \n\t" \
    MOVNTQ" %%mm4, 16(%0)"
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
157 "movq (%1), %%mm0 \n\t"
158 "movq 8(%1), %%mm1 \n\t"
159 "movq 16(%1), %%mm4 \n\t"
160 "movq 24(%1), %%mm5 \n\t"
161 "movq %%mm0, %%mm2 \n\t"
162 "movq %%mm1, %%mm3 \n\t"
163 "movq %%mm4, %%mm6 \n\t"
164 "movq %%mm5, %%mm7 \n\t"
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
200 "movq (%1), %%mm0 \n\t"
201 "movq 8(%1), %%mm2 \n\t"
202 "movq %%mm0, %%mm1 \n\t"
203 "movq %%mm2, %%mm3 \n\t"
204 "pand %%mm4, %%mm0 \n\t"
205 "pand %%mm4, %%mm2 \n\t"
206 "paddw %%mm1, %%mm0 \n\t"
207 "paddw %%mm3, %%mm2 \n\t"
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
244 "movq (%1), %%mm0 \n\t"
245 "movq 8(%1), %%mm2 \n\t"
246 "movq %%mm0, %%mm1 \n\t"
247 "movq %%mm2, %%mm3 \n\t"
248 "psrlq $1, %%mm0 \n\t"
249 "psrlq $1, %%mm2 \n\t"
250 "pand %%mm7, %%mm0 \n\t"
251 "pand %%mm7, %%mm2 \n\t"
252 "pand %%mm6, %%mm1 \n\t"
253 "pand %%mm6, %%mm3 \n\t"
254 "por %%mm1, %%mm0 \n\t"
255 "por %%mm3, %%mm2 \n\t"
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        register uint32_t x= *((const uint32_t*)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        register uint16_t x= *((const uint16_t*)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
    uint16_t *d = (uint16_t *)dst;
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
294 "movd (%1), %%mm0 \n\t"
295 "movd 4(%1), %%mm3 \n\t"
296 "punpckldq 8(%1), %%mm0 \n\t"
297 "punpckldq 12(%1), %%mm3 \n\t"
298 "movq %%mm0, %%mm1 \n\t"
299 "movq %%mm3, %%mm4 \n\t"
300 "pand %%mm6, %%mm0 \n\t"
301 "pand %%mm6, %%mm3 \n\t"
302 "pmaddwd %%mm7, %%mm0 \n\t"
303 "pmaddwd %%mm7, %%mm3 \n\t"
304 "pand %%mm5, %%mm1 \n\t"
305 "pand %%mm5, %%mm4 \n\t"
306 "por %%mm1, %%mm0 \n\t"
307 "por %%mm4, %%mm3 \n\t"
308 "psrld $5, %%mm0 \n\t"
309 "pslld $11, %%mm3 \n\t"
310 "por %%mm3, %%mm0 \n\t"
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    uint16_t *d = (uint16_t *)dst;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
344 "movd (%1), %%mm0 \n\t"
345 "movd 4(%1), %%mm3 \n\t"
346 "punpckldq 8(%1), %%mm0 \n\t"
347 "punpckldq 12(%1), %%mm3 \n\t"
348 "movq %%mm0, %%mm1 \n\t"
349 "movq %%mm0, %%mm2 \n\t"
350 "movq %%mm3, %%mm4 \n\t"
351 "movq %%mm3, %%mm5 \n\t"
352 "psllq $8, %%mm0 \n\t"
353 "psllq $8, %%mm3 \n\t"
354 "pand %%mm7, %%mm0 \n\t"
355 "pand %%mm7, %%mm3 \n\t"
356 "psrlq $5, %%mm1 \n\t"
357 "psrlq $5, %%mm4 \n\t"
358 "pand %%mm6, %%mm1 \n\t"
359 "pand %%mm6, %%mm4 \n\t"
360 "psrlq $19, %%mm2 \n\t"
361 "psrlq $19, %%mm5 \n\t"
362 "pand %2, %%mm2 \n\t"
363 "pand %2, %%mm5 \n\t"
364 "por %%mm1, %%mm0 \n\t"
365 "por %%mm4, %%mm3 \n\t"
366 "por %%mm2, %%mm0 \n\t"
367 "por %%mm5, %%mm3 \n\t"
368 "psllq $16, %%mm3 \n\t"
369 "por %%mm3, %%mm0 \n\t"
        ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    uint16_t *d = (uint16_t *)dst;
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
399 "movd (%1), %%mm0 \n\t"
400 "movd 4(%1), %%mm3 \n\t"
401 "punpckldq 8(%1), %%mm0 \n\t"
402 "punpckldq 12(%1), %%mm3 \n\t"
403 "movq %%mm0, %%mm1 \n\t"
404 "movq %%mm3, %%mm4 \n\t"
405 "pand %%mm6, %%mm0 \n\t"
406 "pand %%mm6, %%mm3 \n\t"
407 "pmaddwd %%mm7, %%mm0 \n\t"
408 "pmaddwd %%mm7, %%mm3 \n\t"
409 "pand %%mm5, %%mm1 \n\t"
410 "pand %%mm5, %%mm4 \n\t"
411 "por %%mm1, %%mm0 \n\t"
412 "por %%mm4, %%mm3 \n\t"
413 "psrld $6, %%mm0 \n\t"
414 "pslld $10, %%mm3 \n\t"
415 "por %%mm3, %%mm0 \n\t"
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    uint16_t *d = (uint16_t *)dst;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
449 "movd (%1), %%mm0 \n\t"
450 "movd 4(%1), %%mm3 \n\t"
451 "punpckldq 8(%1), %%mm0 \n\t"
452 "punpckldq 12(%1), %%mm3 \n\t"
453 "movq %%mm0, %%mm1 \n\t"
454 "movq %%mm0, %%mm2 \n\t"
455 "movq %%mm3, %%mm4 \n\t"
456 "movq %%mm3, %%mm5 \n\t"
457 "psllq $7, %%mm0 \n\t"
458 "psllq $7, %%mm3 \n\t"
459 "pand %%mm7, %%mm0 \n\t"
460 "pand %%mm7, %%mm3 \n\t"
461 "psrlq $6, %%mm1 \n\t"
462 "psrlq $6, %%mm4 \n\t"
463 "pand %%mm6, %%mm1 \n\t"
464 "pand %%mm6, %%mm4 \n\t"
465 "psrlq $19, %%mm2 \n\t"
466 "psrlq $19, %%mm5 \n\t"
467 "pand %2, %%mm2 \n\t"
468 "pand %2, %%mm5 \n\t"
469 "por %%mm1, %%mm0 \n\t"
470 "por %%mm4, %%mm3 \n\t"
471 "por %%mm2, %%mm0 \n\t"
472 "por %%mm5, %%mm3 \n\t"
473 "psllq $16, %%mm3 \n\t"
474 "por %%mm3, %%mm0 \n\t"
        ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    uint16_t *d = (uint16_t *)dst;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
504 "movd (%1), %%mm0 \n\t"
505 "movd 3(%1), %%mm3 \n\t"
506 "punpckldq 6(%1), %%mm0 \n\t"
507 "punpckldq 9(%1), %%mm3 \n\t"
508 "movq %%mm0, %%mm1 \n\t"
509 "movq %%mm0, %%mm2 \n\t"
510 "movq %%mm3, %%mm4 \n\t"
511 "movq %%mm3, %%mm5 \n\t"
512 "psrlq $3, %%mm0 \n\t"
513 "psrlq $3, %%mm3 \n\t"
514 "pand %2, %%mm0 \n\t"
515 "pand %2, %%mm3 \n\t"
516 "psrlq $5, %%mm1 \n\t"
517 "psrlq $5, %%mm4 \n\t"
518 "pand %%mm6, %%mm1 \n\t"
519 "pand %%mm6, %%mm4 \n\t"
520 "psrlq $8, %%mm2 \n\t"
521 "psrlq $8, %%mm5 \n\t"
522 "pand %%mm7, %%mm2 \n\t"
523 "pand %%mm7, %%mm5 \n\t"
524 "por %%mm1, %%mm0 \n\t"
525 "por %%mm4, %%mm3 \n\t"
526 "por %%mm2, %%mm0 \n\t"
527 "por %%mm5, %%mm3 \n\t"
528 "psllq $16, %%mm3 \n\t"
529 "por %%mm3, %%mm0 \n\t"
        ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    uint16_t *d = (uint16_t *)dst;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
561 "movd (%1), %%mm0 \n\t"
562 "movd 3(%1), %%mm3 \n\t"
563 "punpckldq 6(%1), %%mm0 \n\t"
564 "punpckldq 9(%1), %%mm3 \n\t"
565 "movq %%mm0, %%mm1 \n\t"
566 "movq %%mm0, %%mm2 \n\t"
567 "movq %%mm3, %%mm4 \n\t"
568 "movq %%mm3, %%mm5 \n\t"
569 "psllq $8, %%mm0 \n\t"
570 "psllq $8, %%mm3 \n\t"
571 "pand %%mm7, %%mm0 \n\t"
572 "pand %%mm7, %%mm3 \n\t"
573 "psrlq $5, %%mm1 \n\t"
574 "psrlq $5, %%mm4 \n\t"
575 "pand %%mm6, %%mm1 \n\t"
576 "pand %%mm6, %%mm4 \n\t"
577 "psrlq $19, %%mm2 \n\t"
578 "psrlq $19, %%mm5 \n\t"
579 "pand %2, %%mm2 \n\t"
580 "pand %2, %%mm5 \n\t"
581 "por %%mm1, %%mm0 \n\t"
582 "por %%mm4, %%mm3 \n\t"
583 "por %%mm2, %%mm0 \n\t"
584 "por %%mm5, %%mm3 \n\t"
585 "psllq $16, %%mm3 \n\t"
586 "por %%mm3, %%mm0 \n\t"
        ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    uint16_t *d = (uint16_t *)dst;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
618 "movd (%1), %%mm0 \n\t"
619 "movd 3(%1), %%mm3 \n\t"
620 "punpckldq 6(%1), %%mm0 \n\t"
621 "punpckldq 9(%1), %%mm3 \n\t"
622 "movq %%mm0, %%mm1 \n\t"
623 "movq %%mm0, %%mm2 \n\t"
624 "movq %%mm3, %%mm4 \n\t"
625 "movq %%mm3, %%mm5 \n\t"
626 "psrlq $3, %%mm0 \n\t"
627 "psrlq $3, %%mm3 \n\t"
628 "pand %2, %%mm0 \n\t"
629 "pand %2, %%mm3 \n\t"
630 "psrlq $6, %%mm1 \n\t"
631 "psrlq $6, %%mm4 \n\t"
632 "pand %%mm6, %%mm1 \n\t"
633 "pand %%mm6, %%mm4 \n\t"
634 "psrlq $9, %%mm2 \n\t"
635 "psrlq $9, %%mm5 \n\t"
636 "pand %%mm7, %%mm2 \n\t"
637 "pand %%mm7, %%mm5 \n\t"
638 "por %%mm1, %%mm0 \n\t"
639 "por %%mm4, %%mm3 \n\t"
640 "por %%mm2, %%mm0 \n\t"
641 "por %%mm5, %%mm3 \n\t"
642 "psllq $16, %%mm3 \n\t"
643 "por %%mm3, %%mm0 \n\t"
        ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    uint16_t *d = (uint16_t *)dst;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
675 "movd (%1), %%mm0 \n\t"
676 "movd 3(%1), %%mm3 \n\t"
677 "punpckldq 6(%1), %%mm0 \n\t"
678 "punpckldq 9(%1), %%mm3 \n\t"
679 "movq %%mm0, %%mm1 \n\t"
680 "movq %%mm0, %%mm2 \n\t"
681 "movq %%mm3, %%mm4 \n\t"
682 "movq %%mm3, %%mm5 \n\t"
683 "psllq $7, %%mm0 \n\t"
684 "psllq $7, %%mm3 \n\t"
685 "pand %%mm7, %%mm0 \n\t"
686 "pand %%mm7, %%mm3 \n\t"
687 "psrlq $6, %%mm1 \n\t"
688 "psrlq $6, %%mm4 \n\t"
689 "pand %%mm6, %%mm1 \n\t"
690 "pand %%mm6, %%mm4 \n\t"
691 "psrlq $19, %%mm2 \n\t"
692 "psrlq $19, %%mm5 \n\t"
693 "pand %2, %%mm2 \n\t"
694 "pand %2, %%mm5 \n\t"
695 "por %%mm1, %%mm0 \n\t"
696 "por %%mm4, %%mm3 \n\t"
697 "por %%mm2, %%mm0 \n\t"
698 "por %%mm5, %%mm3 \n\t"
699 "psllq $16, %%mm3 \n\t"
700 "por %%mm3, %%mm0 \n\t"
        ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    const uint16_t *mm_end;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
749 "movq (%1), %%mm0 \n\t"
750 "movq (%1), %%mm1 \n\t"
751 "movq (%1), %%mm2 \n\t"
752 "pand %2, %%mm0 \n\t"
753 "pand %3, %%mm1 \n\t"
754 "pand %4, %%mm2 \n\t"
755 "psllq $3, %%mm0 \n\t"
756 "psrlq $2, %%mm1 \n\t"
757 "psrlq $7, %%mm2 \n\t"
758 "movq %%mm0, %%mm3 \n\t"
759 "movq %%mm1, %%mm4 \n\t"
760 "movq %%mm2, %%mm5 \n\t"
761 "punpcklwd %5, %%mm0 \n\t"
762 "punpcklwd %5, %%mm1 \n\t"
763 "punpcklwd %5, %%mm2 \n\t"
764 "punpckhwd %5, %%mm3 \n\t"
765 "punpckhwd %5, %%mm4 \n\t"
766 "punpckhwd %5, %%mm5 \n\t"
767 "psllq $8, %%mm1 \n\t"
768 "psllq $16, %%mm2 \n\t"
769 "por %%mm1, %%mm0 \n\t"
770 "por %%mm2, %%mm0 \n\t"
771 "psllq $8, %%mm4 \n\t"
772 "psllq $16, %%mm5 \n\t"
773 "por %%mm4, %%mm3 \n\t"
774 "por %%mm5, %%mm3 \n\t"
776 "movq %%mm0, %%mm6 \n\t"
777 "movq %%mm3, %%mm7 \n\t"
779 "movq 8(%1), %%mm0 \n\t"
780 "movq 8(%1), %%mm1 \n\t"
781 "movq 8(%1), %%mm2 \n\t"
782 "pand %2, %%mm0 \n\t"
783 "pand %3, %%mm1 \n\t"
784 "pand %4, %%mm2 \n\t"
785 "psllq $3, %%mm0 \n\t"
786 "psrlq $2, %%mm1 \n\t"
787 "psrlq $7, %%mm2 \n\t"
788 "movq %%mm0, %%mm3 \n\t"
789 "movq %%mm1, %%mm4 \n\t"
790 "movq %%mm2, %%mm5 \n\t"
791 "punpcklwd %5, %%mm0 \n\t"
792 "punpcklwd %5, %%mm1 \n\t"
793 "punpcklwd %5, %%mm2 \n\t"
794 "punpckhwd %5, %%mm3 \n\t"
795 "punpckhwd %5, %%mm4 \n\t"
796 "punpckhwd %5, %%mm5 \n\t"
797 "psllq $8, %%mm1 \n\t"
798 "psllq $16, %%mm2 \n\t"
799 "por %%mm1, %%mm0 \n\t"
800 "por %%mm2, %%mm0 \n\t"
801 "psllq $8, %%mm4 \n\t"
802 "psllq $16, %%mm5 \n\t"
803 "por %%mm4, %%mm3 \n\t"
804 "por %%mm5, %%mm3 \n\t"
        : "r"(s), "m"(mask15b), "m"(mask15g), "m"(mask15r), "m"(mmx_null)
811 "movq %%mm0, %%mm4 \n\t"
812 "movq %%mm3, %%mm5 \n\t"
813 "movq %%mm6, %%mm0 \n\t"
814 "movq %%mm7, %%mm1 \n\t"
816 "movq %%mm4, %%mm6 \n\t"
817 "movq %%mm5, %%mm7 \n\t"
818 "movq %%mm0, %%mm2 \n\t"
819 "movq %%mm1, %%mm3 \n\t"
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        register uint16_t bgr;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    const uint16_t *mm_end;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
851 "movq (%1), %%mm0 \n\t"
852 "movq (%1), %%mm1 \n\t"
853 "movq (%1), %%mm2 \n\t"
854 "pand %2, %%mm0 \n\t"
855 "pand %3, %%mm1 \n\t"
856 "pand %4, %%mm2 \n\t"
857 "psllq $3, %%mm0 \n\t"
858 "psrlq $3, %%mm1 \n\t"
859 "psrlq $8, %%mm2 \n\t"
860 "movq %%mm0, %%mm3 \n\t"
861 "movq %%mm1, %%mm4 \n\t"
862 "movq %%mm2, %%mm5 \n\t"
863 "punpcklwd %5, %%mm0 \n\t"
864 "punpcklwd %5, %%mm1 \n\t"
865 "punpcklwd %5, %%mm2 \n\t"
866 "punpckhwd %5, %%mm3 \n\t"
867 "punpckhwd %5, %%mm4 \n\t"
868 "punpckhwd %5, %%mm5 \n\t"
869 "psllq $8, %%mm1 \n\t"
870 "psllq $16, %%mm2 \n\t"
871 "por %%mm1, %%mm0 \n\t"
872 "por %%mm2, %%mm0 \n\t"
873 "psllq $8, %%mm4 \n\t"
874 "psllq $16, %%mm5 \n\t"
875 "por %%mm4, %%mm3 \n\t"
876 "por %%mm5, %%mm3 \n\t"
878 "movq %%mm0, %%mm6 \n\t"
879 "movq %%mm3, %%mm7 \n\t"
881 "movq 8(%1), %%mm0 \n\t"
882 "movq 8(%1), %%mm1 \n\t"
883 "movq 8(%1), %%mm2 \n\t"
884 "pand %2, %%mm0 \n\t"
885 "pand %3, %%mm1 \n\t"
886 "pand %4, %%mm2 \n\t"
887 "psllq $3, %%mm0 \n\t"
888 "psrlq $3, %%mm1 \n\t"
889 "psrlq $8, %%mm2 \n\t"
890 "movq %%mm0, %%mm3 \n\t"
891 "movq %%mm1, %%mm4 \n\t"
892 "movq %%mm2, %%mm5 \n\t"
893 "punpcklwd %5, %%mm0 \n\t"
894 "punpcklwd %5, %%mm1 \n\t"
895 "punpcklwd %5, %%mm2 \n\t"
896 "punpckhwd %5, %%mm3 \n\t"
897 "punpckhwd %5, %%mm4 \n\t"
898 "punpckhwd %5, %%mm5 \n\t"
899 "psllq $8, %%mm1 \n\t"
900 "psllq $16, %%mm2 \n\t"
901 "por %%mm1, %%mm0 \n\t"
902 "por %%mm2, %%mm0 \n\t"
903 "psllq $8, %%mm4 \n\t"
904 "psllq $16, %%mm5 \n\t"
905 "por %%mm4, %%mm3 \n\t"
906 "por %%mm5, %%mm3 \n\t"
        : "r"(s), "m"(mask16b), "m"(mask16g), "m"(mask16r), "m"(mmx_null)
912 "movq %%mm0, %%mm4 \n\t"
913 "movq %%mm3, %%mm5 \n\t"
914 "movq %%mm6, %%mm0 \n\t"
915 "movq %%mm7, %%mm1 \n\t"
917 "movq %%mm4, %%mm6 \n\t"
918 "movq %%mm5, %%mm7 \n\t"
919 "movq %%mm0, %%mm2 \n\t"
920 "movq %%mm1, %%mm3 \n\t"
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        register uint16_t bgr;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
948 "packuswb %%mm7, %%mm0 \n\t" \
949 "packuswb %%mm7, %%mm1 \n\t" \
950 "packuswb %%mm7, %%mm2 \n\t" \
951 "punpcklbw %%mm1, %%mm0 \n\t" \
952 "punpcklbw %%mm6, %%mm2 \n\t" \
953 "movq %%mm0, %%mm3 \n\t" \
954 "punpcklwd %%mm2, %%mm0 \n\t" \
955 "punpckhwd %%mm2, %%mm3 \n\t" \
956 MOVNTQ" %%mm0, (%0) \n\t" \
957 MOVNTQ" %%mm3, 8(%0) \n\t" \
    const uint16_t *mm_end;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
973 "movq (%1), %%mm0 \n\t"
974 "movq (%1), %%mm1 \n\t"
975 "movq (%1), %%mm2 \n\t"
976 "pand %2, %%mm0 \n\t"
977 "pand %3, %%mm1 \n\t"
978 "pand %4, %%mm2 \n\t"
979 "psllq $3, %%mm0 \n\t"
980 "psrlq $2, %%mm1 \n\t"
981 "psrlq $7, %%mm2 \n\t"
        ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        register uint16_t bgr;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    const uint16_t *end;
    const uint16_t *mm_end;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
    while (s < mm_end) {
        "movq   (%1), %%mm0     \n\t"
        "movq   (%1), %%mm1     \n\t"
        "movq   (%1), %%mm2     \n\t"
        "pand %2, %%mm0         \n\t"
        "pand %3, %%mm1         \n\t"
        "pand %4, %%mm2         \n\t"
        "psllq $3, %%mm0        \n\t"
        "psrlq $3, %%mm1        \n\t"
        "psrlq $8, %%mm2        \n\t"
        ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        register uint16_t bgr;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
1050 "movq %3, %%mm7 \n\t"
1051 "pxor %4, %%mm7 \n\t"
1052 "movq %%mm7, %%mm6 \n\t"
1053 "pxor %5, %%mm7 \n\t"
1057 "movq (%1, %0), %%mm0 \n\t"
1058 "movq 8(%1, %0), %%mm1 \n\t"
1059 # if COMPILE_TEMPLATE_MMXEXT
1060 "pshufw $177, %%mm0, %%mm3 \n\t"
1061 "pshufw $177, %%mm1, %%mm5 \n\t"
1062 "pand %%mm7, %%mm0 \n\t"
1063 "pand %%mm6, %%mm3 \n\t"
1064 "pand %%mm7, %%mm1 \n\t"
1065 "pand %%mm6, %%mm5 \n\t"
1066 "por %%mm3, %%mm0 \n\t"
1067 "por %%mm5, %%mm1 \n\t"
1069 "movq %%mm0, %%mm2 \n\t"
1070 "movq %%mm1, %%mm4 \n\t"
1071 "pand %%mm7, %%mm0 \n\t"
1072 "pand %%mm6, %%mm2 \n\t"
1073 "pand %%mm7, %%mm1 \n\t"
1074 "pand %%mm6, %%mm4 \n\t"
1075 "movq %%mm2, %%mm3 \n\t"
1076 "movq %%mm4, %%mm5 \n\t"
1077 "pslld $16, %%mm2 \n\t"
1078 "psrld $16, %%mm3 \n\t"
1079 "pslld $16, %%mm4 \n\t"
1080 "psrld $16, %%mm5 \n\t"
1081 "por %%mm2, %%mm0 \n\t"
1082 "por %%mm4, %%mm1 \n\t"
1083 "por %%mm3, %%mm0 \n\t"
1084 "por %%mm5, %%mm1 \n\t"
1086 MOVNTQ" %%mm0, (%2, %0) \n\t"
1087 MOVNTQ" %%mm1, 8(%2, %0) \n\t"
        : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
1096 for (; idx<15; idx+=4) {
1097 register int v = *(
const uint32_t *)&s[idx],
g = v & 0xff00ff00;
1099 *(uint32_t *)&d[idx] = (v>>16) +
g + (v<<16);
    x86_reg mmx_size= 23 - src_size;
        "test %%"REG_a", %%"REG_a"          \n\t"
        "movq "MANGLE(mask24r)", %%mm5      \n\t"
        "movq "MANGLE(mask24g)", %%mm6      \n\t"
        "movq "MANGLE(mask24b)", %%mm7      \n\t"
        "movq   (%1, %%"REG_a"), %%mm0      \n\t"
        "movq   (%1, %%"REG_a"), %%mm1      \n\t"
        "movq  2(%1, %%"REG_a"), %%mm2      \n\t"
        "psllq $16, %%mm0                   \n\t"
        "pand %%mm5, %%mm0                  \n\t"
        "pand %%mm6, %%mm1                  \n\t"
        "pand %%mm7, %%mm2                  \n\t"
        "por %%mm0, %%mm1                   \n\t"
        "por %%mm2, %%mm1                   \n\t"
        "movq  6(%1, %%"REG_a"), %%mm0      \n\t"
        MOVNTQ" %%mm1,   (%2, %%"REG_a")    \n\t"
        "movq  8(%1, %%"REG_a"), %%mm1      \n\t"
        "movq 10(%1, %%"REG_a"), %%mm2      \n\t"
        "pand %%mm7, %%mm0                  \n\t"
        "pand %%mm5, %%mm1                  \n\t"
        "pand %%mm6, %%mm2                  \n\t"
        "por %%mm0, %%mm1                   \n\t"
        "por %%mm2, %%mm1                   \n\t"
        "movq 14(%1, %%"REG_a"), %%mm0      \n\t"
        MOVNTQ" %%mm1,  8(%2, %%"REG_a")    \n\t"
        "movq 16(%1, %%"REG_a"), %%mm1      \n\t"
        "movq 18(%1, %%"REG_a"), %%mm2      \n\t"
        "pand %%mm6, %%mm0                  \n\t"
        "pand %%mm7, %%mm1                  \n\t"
        "pand %%mm5, %%mm2                  \n\t"
        "por %%mm0, %%mm1                   \n\t"
        "por %%mm2, %%mm1                   \n\t"
        MOVNTQ" %%mm1, 16(%2, %%"REG_a")    \n\t"
        "add $24, %%"REG_a"                 \n\t"
        : "r" (src-mmx_size), "r"(dst-mmx_size)
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    if (mmx_size==23) return;
    src_size= 23-mmx_size;
    for (i=0; i<src_size; i+=3) {
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
                                    int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
    for (y=0; y<height; y++) {
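        /* Interleave one luma line with one chroma pair into Y0 U Y1 V order:
         * the U/V bytes are zipped first, then punpcklbw/punpckhbw against the
         * luma quadwords produce 32 output bytes per iteration. */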
1179 "xor %%"REG_a
", %%"REG_a
" \n\t"
1182 PREFETCH" 32(%1, %%"REG_a
", 2) \n\t"
1185 "movq (%2, %%"REG_a
"), %%mm0 \n\t"
1186 "movq %%mm0, %%mm2 \n\t"
1187 "movq (%3, %%"REG_a
"), %%mm1 \n\t"
1188 "punpcklbw %%mm1, %%mm0 \n\t"
1189 "punpckhbw %%mm1, %%mm2 \n\t"
1191 "movq (%1, %%"REG_a
",2), %%mm3 \n\t"
1192 "movq 8(%1, %%"REG_a
",2), %%mm5 \n\t"
1193 "movq %%mm3, %%mm4 \n\t"
1194 "movq %%mm5, %%mm6 \n\t"
1195 "punpcklbw %%mm0, %%mm3 \n\t"
1196 "punpckhbw %%mm0, %%mm4 \n\t"
1197 "punpcklbw %%mm2, %%mm5 \n\t"
1198 "punpckhbw %%mm2, %%mm6 \n\t"
1200 MOVNTQ" %%mm3, (%0, %%"REG_a
", 4) \n\t"
1201 MOVNTQ" %%mm4, 8(%0, %%"REG_a
", 4) \n\t"
1202 MOVNTQ" %%mm5, 16(%0, %%"REG_a
", 4) \n\t"
1203 MOVNTQ" %%mm6, 24(%0, %%"REG_a
", 4) \n\t"
1205 "add $8, %%"REG_a
" \n\t"
1206 "cmp %4, %%"REG_a
" \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1211 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1212 usrc += chromStride;
1213 vsrc += chromStride;
1229 int lumStride,
int chromStride,
int dstStride)
                                    int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
    for (y=0; y<height; y++) {
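        /* Same interleave as the yuy2 path above, but with the chroma bytes
         * first: U Y0 V Y1, i.e. UYVY byte order. */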
1244 "xor %%"REG_a
", %%"REG_a
" \n\t"
1247 PREFETCH" 32(%1, %%"REG_a
", 2) \n\t"
1250 "movq (%2, %%"REG_a
"), %%mm0 \n\t"
1251 "movq %%mm0, %%mm2 \n\t"
1252 "movq (%3, %%"REG_a
"), %%mm1 \n\t"
1253 "punpcklbw %%mm1, %%mm0 \n\t"
1254 "punpckhbw %%mm1, %%mm2 \n\t"
1256 "movq (%1, %%"REG_a
",2), %%mm3 \n\t"
1257 "movq 8(%1, %%"REG_a
",2), %%mm5 \n\t"
1258 "movq %%mm0, %%mm4 \n\t"
1259 "movq %%mm2, %%mm6 \n\t"
1260 "punpcklbw %%mm3, %%mm0 \n\t"
1261 "punpckhbw %%mm3, %%mm4 \n\t"
1262 "punpcklbw %%mm5, %%mm2 \n\t"
1263 "punpckhbw %%mm5, %%mm6 \n\t"
1265 MOVNTQ" %%mm0, (%0, %%"REG_a
", 4) \n\t"
1266 MOVNTQ" %%mm4, 8(%0, %%"REG_a
", 4) \n\t"
1267 MOVNTQ" %%mm2, 16(%0, %%"REG_a
", 4) \n\t"
1268 MOVNTQ" %%mm6, 24(%0, %%"REG_a
", 4) \n\t"
1270 "add $8, %%"REG_a
" \n\t"
1271 "cmp %4, %%"REG_a
" \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1276 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1277 usrc += chromStride;
1278 vsrc += chromStride;
1294 int lumStride,
int chromStride,
int dstStride)
1305 int lumStride,
int chromStride,
int dstStride)
                              int lumStride, int chromStride, int dstStride)
                              int lumStride, int chromStride, int srcStride)
    for (y=0; y<height; y+=2) {
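        /* YUYV -> planar: %%mm7 (0x00FF words) selects the luma bytes; the
         * chroma bytes are shifted down with psrlw, repacked with packuswb and
         * finally split into the U and V planes. */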
1332 "xor %%"REG_a
", %%"REG_a
" \n\t"
1333 "pcmpeqw %%mm7, %%mm7 \n\t"
1334 "psrlw $8, %%mm7 \n\t"
1337 PREFETCH" 64(%0, %%"REG_a
", 4) \n\t"
1338 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t"
1339 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t"
1340 "movq %%mm0, %%mm2 \n\t"
1341 "movq %%mm1, %%mm3 \n\t"
1342 "psrlw $8, %%mm0 \n\t"
1343 "psrlw $8, %%mm1 \n\t"
1344 "pand %%mm7, %%mm2 \n\t"
1345 "pand %%mm7, %%mm3 \n\t"
1346 "packuswb %%mm1, %%mm0 \n\t"
1347 "packuswb %%mm3, %%mm2 \n\t"
1349 MOVNTQ" %%mm2, (%1, %%"REG_a
", 2) \n\t"
1351 "movq 16(%0, %%"REG_a
", 4), %%mm1 \n\t"
1352 "movq 24(%0, %%"REG_a
", 4), %%mm2 \n\t"
1353 "movq %%mm1, %%mm3 \n\t"
1354 "movq %%mm2, %%mm4 \n\t"
1355 "psrlw $8, %%mm1 \n\t"
1356 "psrlw $8, %%mm2 \n\t"
1357 "pand %%mm7, %%mm3 \n\t"
1358 "pand %%mm7, %%mm4 \n\t"
1359 "packuswb %%mm2, %%mm1 \n\t"
1360 "packuswb %%mm4, %%mm3 \n\t"
1362 MOVNTQ" %%mm3, 8(%1, %%"REG_a
", 2) \n\t"
1364 "movq %%mm0, %%mm2 \n\t"
1365 "movq %%mm1, %%mm3 \n\t"
1366 "psrlw $8, %%mm0 \n\t"
1367 "psrlw $8, %%mm1 \n\t"
1368 "pand %%mm7, %%mm2 \n\t"
1369 "pand %%mm7, %%mm3 \n\t"
1370 "packuswb %%mm1, %%mm0 \n\t"
1371 "packuswb %%mm3, %%mm2 \n\t"
1373 MOVNTQ" %%mm0, (%3, %%"REG_a
") \n\t"
1374 MOVNTQ" %%mm2, (%2, %%"REG_a
") \n\t"
1376 "add $8, %%"REG_a
" \n\t"
1377 "cmp %4, %%"REG_a
" \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
1387 "xor %%"REG_a
", %%"REG_a
" \n\t"
1390 PREFETCH" 64(%0, %%"REG_a
", 4) \n\t"
1391 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t"
1392 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t"
1393 "movq 16(%0, %%"REG_a
", 4), %%mm2 \n\t"
1394 "movq 24(%0, %%"REG_a
", 4), %%mm3 \n\t"
1395 "pand %%mm7, %%mm0 \n\t"
1396 "pand %%mm7, %%mm1 \n\t"
1397 "pand %%mm7, %%mm2 \n\t"
1398 "pand %%mm7, %%mm3 \n\t"
1399 "packuswb %%mm1, %%mm0 \n\t"
1400 "packuswb %%mm3, %%mm2 \n\t"
1402 MOVNTQ" %%mm0, (%1, %%"REG_a
", 2) \n\t"
1403 MOVNTQ" %%mm2, 8(%1, %%"REG_a
", 2) \n\t"
1405 "add $8, %%"REG_a
" \n\t"
1406 "cmp %4, %%"REG_a
" \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        udst += chromStride;
        vdst += chromStride;
    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];
    for (y=1; y<srcHeight; y++) {
        const x86_reg mmxSize= srcWidth&~15;
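        /* 2x upscale with bilinear weights: each pair of PAVGB instructions
         * below approximates the (3*a + b + 2)>>2 and (a + 3*b + 2)>>2 blends
         * of the current and next source line, eight pixels at a time. */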
1442 "mov %4, %%"REG_a
" \n\t"
1443 "movq "MANGLE(mmx_ff)
", %%mm0 \n\t"
1444 "movq (%0, %%"REG_a
"), %%mm4 \n\t"
1445 "movq %%mm4, %%mm2 \n\t"
1446 "psllq $8, %%mm4 \n\t"
1447 "pand %%mm0, %%mm2 \n\t"
1448 "por %%mm2, %%mm4 \n\t"
1449 "movq (%1, %%"REG_a
"), %%mm5 \n\t"
1450 "movq %%mm5, %%mm3 \n\t"
1451 "psllq $8, %%mm5 \n\t"
1452 "pand %%mm0, %%mm3 \n\t"
1453 "por %%mm3, %%mm5 \n\t"
1455 "movq (%0, %%"REG_a
"), %%mm0 \n\t"
1456 "movq (%1, %%"REG_a
"), %%mm1 \n\t"
1457 "movq 1(%0, %%"REG_a
"), %%mm2 \n\t"
1458 "movq 1(%1, %%"REG_a
"), %%mm3 \n\t"
1459 PAVGB
" %%mm0, %%mm5 \n\t"
1460 PAVGB
" %%mm0, %%mm3 \n\t"
1461 PAVGB
" %%mm0, %%mm5 \n\t"
1462 PAVGB
" %%mm0, %%mm3 \n\t"
1463 PAVGB
" %%mm1, %%mm4 \n\t"
1464 PAVGB
" %%mm1, %%mm2 \n\t"
1465 PAVGB
" %%mm1, %%mm4 \n\t"
1466 PAVGB
" %%mm1, %%mm2 \n\t"
1467 "movq %%mm5, %%mm7 \n\t"
1468 "movq %%mm4, %%mm6 \n\t"
1469 "punpcklbw %%mm3, %%mm5 \n\t"
1470 "punpckhbw %%mm3, %%mm7 \n\t"
1471 "punpcklbw %%mm2, %%mm4 \n\t"
1472 "punpckhbw %%mm2, %%mm6 \n\t"
1473 MOVNTQ" %%mm5, (%2, %%"REG_a
", 2) \n\t"
1474 MOVNTQ" %%mm7, 8(%2, %%"REG_a
", 2) \n\t"
1475 MOVNTQ" %%mm4, (%3, %%"REG_a
", 2) \n\t"
1476 MOVNTQ" %%mm6, 8(%3, %%"REG_a
", 2) \n\t"
1477 "add $8, %%"REG_a
" \n\t"
1478 "movq -1(%0, %%"REG_a
"), %%mm4 \n\t"
1479 "movq -1(%1, %%"REG_a
"), %%mm5 \n\t"
            :: "r" (src + mmxSize), "r" (src + srcStride + mmxSize),
               "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
        for (x=mmxSize-1; x<srcWidth-1; x++) {
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
        }
        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1503 for (x=0; x<srcWidth-1; x++) {
1504 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1505 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1507 dst[2*srcWidth-1]= src[srcWidth-1];
1509 __asm__
volatile(
EMMS" \n\t"
#if !COMPILE_TEMPLATE_AMD3DNOW
                              int lumStride, int chromStride, int srcStride)
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y+=2) {
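        /* UYVY -> planar: identical to the YUYV path above except that the
         * pand/psrlw roles are swapped, since luma sits in the odd bytes. */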
1530 "xor %%"REG_a
", %%"REG_a
" \n\t"
1531 "pcmpeqw %%mm7, %%mm7 \n\t"
1532 "psrlw $8, %%mm7 \n\t"
1535 PREFETCH" 64(%0, %%"REG_a
", 4) \n\t"
1536 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t"
1537 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t"
1538 "movq %%mm0, %%mm2 \n\t"
1539 "movq %%mm1, %%mm3 \n\t"
1540 "pand %%mm7, %%mm0 \n\t"
1541 "pand %%mm7, %%mm1 \n\t"
1542 "psrlw $8, %%mm2 \n\t"
1543 "psrlw $8, %%mm3 \n\t"
1544 "packuswb %%mm1, %%mm0 \n\t"
1545 "packuswb %%mm3, %%mm2 \n\t"
1547 MOVNTQ" %%mm2, (%1, %%"REG_a
", 2) \n\t"
1549 "movq 16(%0, %%"REG_a
", 4), %%mm1 \n\t"
1550 "movq 24(%0, %%"REG_a
", 4), %%mm2 \n\t"
1551 "movq %%mm1, %%mm3 \n\t"
1552 "movq %%mm2, %%mm4 \n\t"
1553 "pand %%mm7, %%mm1 \n\t"
1554 "pand %%mm7, %%mm2 \n\t"
1555 "psrlw $8, %%mm3 \n\t"
1556 "psrlw $8, %%mm4 \n\t"
1557 "packuswb %%mm2, %%mm1 \n\t"
1558 "packuswb %%mm4, %%mm3 \n\t"
1560 MOVNTQ" %%mm3, 8(%1, %%"REG_a
", 2) \n\t"
1562 "movq %%mm0, %%mm2 \n\t"
1563 "movq %%mm1, %%mm3 \n\t"
1564 "psrlw $8, %%mm0 \n\t"
1565 "psrlw $8, %%mm1 \n\t"
1566 "pand %%mm7, %%mm2 \n\t"
1567 "pand %%mm7, %%mm3 \n\t"
1568 "packuswb %%mm1, %%mm0 \n\t"
1569 "packuswb %%mm3, %%mm2 \n\t"
1571 MOVNTQ" %%mm0, (%3, %%"REG_a
") \n\t"
1572 MOVNTQ" %%mm2, (%2, %%"REG_a
") \n\t"
1574 "add $8, %%"REG_a
" \n\t"
1575 "cmp %4, %%"REG_a
" \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
1585 "xor %%"REG_a
", %%"REG_a
" \n\t"
1588 PREFETCH" 64(%0, %%"REG_a
", 4) \n\t"
1589 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t"
1590 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t"
1591 "movq 16(%0, %%"REG_a
", 4), %%mm2 \n\t"
1592 "movq 24(%0, %%"REG_a
", 4), %%mm3 \n\t"
1593 "psrlw $8, %%mm0 \n\t"
1594 "psrlw $8, %%mm1 \n\t"
1595 "psrlw $8, %%mm2 \n\t"
1596 "psrlw $8, %%mm3 \n\t"
1597 "packuswb %%mm1, %%mm0 \n\t"
1598 "packuswb %%mm3, %%mm2 \n\t"
1600 MOVNTQ" %%mm0, (%1, %%"REG_a
", 2) \n\t"
1601 MOVNTQ" %%mm2, 8(%1, %%"REG_a
", 2) \n\t"
1603 "add $8, %%"REG_a
" \n\t"
1604 "cmp %4, %%"REG_a
" \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        udst += chromStride;
        vdst += chromStride;
    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
                              int lumStride, int chromStride, int srcStride)
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height-2; y+=2) {
        for (i=0; i<2; i++) {
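            /* Per input row: each BGR triple is multiplied by the packed luma
             * coefficients (ff_bgr2YCoeff) with pmaddwd, summed via ff_w1111,
             * scaled down, and biased with ff_bgr2YOffset to give 8 Y samples. */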
1638 "mov %2, %%"REG_a
" \n\t"
1639 "movq "MANGLE(ff_bgr2YCoeff)
", %%mm6 \n\t"
1640 "movq "MANGLE(ff_w1111)
", %%mm5 \n\t"
1641 "pxor %%mm7, %%mm7 \n\t"
1642 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_d
" \n\t"
1646 "movd (%0, %%"REG_d
"), %%mm0 \n\t"
1647 "movd 3(%0, %%"REG_d
"), %%mm1 \n\t"
1648 "punpcklbw %%mm7, %%mm0 \n\t"
1649 "punpcklbw %%mm7, %%mm1 \n\t"
1650 "movd 6(%0, %%"REG_d
"), %%mm2 \n\t"
1651 "movd 9(%0, %%"REG_d
"), %%mm3 \n\t"
1652 "punpcklbw %%mm7, %%mm2 \n\t"
1653 "punpcklbw %%mm7, %%mm3 \n\t"
1654 "pmaddwd %%mm6, %%mm0 \n\t"
1655 "pmaddwd %%mm6, %%mm1 \n\t"
1656 "pmaddwd %%mm6, %%mm2 \n\t"
1657 "pmaddwd %%mm6, %%mm3 \n\t"
1658 #ifndef FAST_BGR2YV12
1659 "psrad $8, %%mm0 \n\t"
1660 "psrad $8, %%mm1 \n\t"
1661 "psrad $8, %%mm2 \n\t"
1662 "psrad $8, %%mm3 \n\t"
1664 "packssdw %%mm1, %%mm0 \n\t"
1665 "packssdw %%mm3, %%mm2 \n\t"
1666 "pmaddwd %%mm5, %%mm0 \n\t"
1667 "pmaddwd %%mm5, %%mm2 \n\t"
1668 "packssdw %%mm2, %%mm0 \n\t"
1669 "psraw $7, %%mm0 \n\t"
1671 "movd 12(%0, %%"REG_d
"), %%mm4 \n\t"
1672 "movd 15(%0, %%"REG_d
"), %%mm1 \n\t"
1673 "punpcklbw %%mm7, %%mm4 \n\t"
1674 "punpcklbw %%mm7, %%mm1 \n\t"
1675 "movd 18(%0, %%"REG_d
"), %%mm2 \n\t"
1676 "movd 21(%0, %%"REG_d
"), %%mm3 \n\t"
1677 "punpcklbw %%mm7, %%mm2 \n\t"
1678 "punpcklbw %%mm7, %%mm3 \n\t"
1679 "pmaddwd %%mm6, %%mm4 \n\t"
1680 "pmaddwd %%mm6, %%mm1 \n\t"
1681 "pmaddwd %%mm6, %%mm2 \n\t"
1682 "pmaddwd %%mm6, %%mm3 \n\t"
1683 #ifndef FAST_BGR2YV12
1684 "psrad $8, %%mm4 \n\t"
1685 "psrad $8, %%mm1 \n\t"
1686 "psrad $8, %%mm2 \n\t"
1687 "psrad $8, %%mm3 \n\t"
1689 "packssdw %%mm1, %%mm4 \n\t"
1690 "packssdw %%mm3, %%mm2 \n\t"
1691 "pmaddwd %%mm5, %%mm4 \n\t"
1692 "pmaddwd %%mm5, %%mm2 \n\t"
1693 "add $24, %%"REG_d
" \n\t"
1694 "packssdw %%mm2, %%mm4 \n\t"
1695 "psraw $7, %%mm4 \n\t"
1697 "packuswb %%mm4, %%mm0 \n\t"
1698 "paddusb "MANGLE(ff_bgr2YOffset)
", %%mm0 \n\t"
1700 MOVNTQ" %%mm0, (%1, %%"REG_a
") \n\t"
1701 "add $8, %%"REG_a
" \n\t"
            : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
            : "%"REG_a, "%"REG_d
1711 "mov %4, %%"REG_a
" \n\t"
1712 "movq "MANGLE(ff_w1111)
", %%mm5 \n\t"
1713 "movq "MANGLE(ff_bgr2UCoeff)
", %%mm6 \n\t"
1714 "pxor %%mm7, %%mm7 \n\t"
1715 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_d
" \n\t"
1716 "add %%"REG_d
", %%"REG_d
" \n\t"
1721 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
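            /* With MMXEXT/3DNow!, PAVGB first averages the two input rows and
             * then the horizontal neighbours, giving the 2x2 box filter that
             * 4:2:0 chroma needs; the plain-MMX path sums with paddw instead. */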
1722 "movq (%0, %%"REG_d
"), %%mm0 \n\t"
1723 "movq (%1, %%"REG_d
"), %%mm1 \n\t"
1724 "movq 6(%0, %%"REG_d
"), %%mm2 \n\t"
1725 "movq 6(%1, %%"REG_d
"), %%mm3 \n\t"
1726 PAVGB
" %%mm1, %%mm0 \n\t"
1727 PAVGB
" %%mm3, %%mm2 \n\t"
1728 "movq %%mm0, %%mm1 \n\t"
1729 "movq %%mm2, %%mm3 \n\t"
1730 "psrlq $24, %%mm0 \n\t"
1731 "psrlq $24, %%mm2 \n\t"
1732 PAVGB
" %%mm1, %%mm0 \n\t"
1733 PAVGB
" %%mm3, %%mm2 \n\t"
1734 "punpcklbw %%mm7, %%mm0 \n\t"
1735 "punpcklbw %%mm7, %%mm2 \n\t"
1737 "movd (%0, %%"REG_d
"), %%mm0 \n\t"
1738 "movd (%1, %%"REG_d
"), %%mm1 \n\t"
1739 "movd 3(%0, %%"REG_d
"), %%mm2 \n\t"
1740 "movd 3(%1, %%"REG_d
"), %%mm3 \n\t"
1741 "punpcklbw %%mm7, %%mm0 \n\t"
1742 "punpcklbw %%mm7, %%mm1 \n\t"
1743 "punpcklbw %%mm7, %%mm2 \n\t"
1744 "punpcklbw %%mm7, %%mm3 \n\t"
1745 "paddw %%mm1, %%mm0 \n\t"
1746 "paddw %%mm3, %%mm2 \n\t"
1747 "paddw %%mm2, %%mm0 \n\t"
1748 "movd 6(%0, %%"REG_d
"), %%mm4 \n\t"
1749 "movd 6(%1, %%"REG_d
"), %%mm1 \n\t"
1750 "movd 9(%0, %%"REG_d
"), %%mm2 \n\t"
1751 "movd 9(%1, %%"REG_d
"), %%mm3 \n\t"
1752 "punpcklbw %%mm7, %%mm4 \n\t"
1753 "punpcklbw %%mm7, %%mm1 \n\t"
1754 "punpcklbw %%mm7, %%mm2 \n\t"
1755 "punpcklbw %%mm7, %%mm3 \n\t"
1756 "paddw %%mm1, %%mm4 \n\t"
1757 "paddw %%mm3, %%mm2 \n\t"
1758 "paddw %%mm4, %%mm2 \n\t"
1759 "psrlw $2, %%mm0 \n\t"
1760 "psrlw $2, %%mm2 \n\t"
1762 "movq "MANGLE(ff_bgr2VCoeff)
", %%mm1 \n\t"
1763 "movq "MANGLE(ff_bgr2VCoeff)
", %%mm3 \n\t"
1765 "pmaddwd %%mm0, %%mm1 \n\t"
1766 "pmaddwd %%mm2, %%mm3 \n\t"
1767 "pmaddwd %%mm6, %%mm0 \n\t"
1768 "pmaddwd %%mm6, %%mm2 \n\t"
1769 #ifndef FAST_BGR2YV12
1770 "psrad $8, %%mm0 \n\t"
1771 "psrad $8, %%mm1 \n\t"
1772 "psrad $8, %%mm2 \n\t"
1773 "psrad $8, %%mm3 \n\t"
1775 "packssdw %%mm2, %%mm0 \n\t"
1776 "packssdw %%mm3, %%mm1 \n\t"
1777 "pmaddwd %%mm5, %%mm0 \n\t"
1778 "pmaddwd %%mm5, %%mm1 \n\t"
1779 "packssdw %%mm1, %%mm0 \n\t"
1780 "psraw $7, %%mm0 \n\t"
#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
            "movq 12(%0, %%"REG_d"), %%mm4      \n\t"
            "movq 12(%1, %%"REG_d"), %%mm1      \n\t"
            "movq 18(%0, %%"REG_d"), %%mm2      \n\t"
            "movq 18(%1, %%"REG_d"), %%mm3      \n\t"
            PAVGB" %%mm1, %%mm4                 \n\t"
            PAVGB" %%mm3, %%mm2                 \n\t"
            "movq %%mm4, %%mm1                  \n\t"
            "movq %%mm2, %%mm3                  \n\t"
            "psrlq $24, %%mm4                   \n\t"
            "psrlq $24, %%mm2                   \n\t"
            PAVGB" %%mm1, %%mm4                 \n\t"
            PAVGB" %%mm3, %%mm2                 \n\t"
            "punpcklbw %%mm7, %%mm4             \n\t"
            "punpcklbw %%mm7, %%mm2             \n\t"
#else
1798 "movd 12(%0, %%"REG_d
"), %%mm4 \n\t"
1799 "movd 12(%1, %%"REG_d
"), %%mm1 \n\t"
1800 "movd 15(%0, %%"REG_d
"), %%mm2 \n\t"
1801 "movd 15(%1, %%"REG_d
"), %%mm3 \n\t"
1802 "punpcklbw %%mm7, %%mm4 \n\t"
1803 "punpcklbw %%mm7, %%mm1 \n\t"
1804 "punpcklbw %%mm7, %%mm2 \n\t"
1805 "punpcklbw %%mm7, %%mm3 \n\t"
1806 "paddw %%mm1, %%mm4 \n\t"
1807 "paddw %%mm3, %%mm2 \n\t"
1808 "paddw %%mm2, %%mm4 \n\t"
1809 "movd 18(%0, %%"REG_d
"), %%mm5 \n\t"
1810 "movd 18(%1, %%"REG_d
"), %%mm1 \n\t"
1811 "movd 21(%0, %%"REG_d
"), %%mm2 \n\t"
1812 "movd 21(%1, %%"REG_d
"), %%mm3 \n\t"
1813 "punpcklbw %%mm7, %%mm5 \n\t"
1814 "punpcklbw %%mm7, %%mm1 \n\t"
1815 "punpcklbw %%mm7, %%mm2 \n\t"
1816 "punpcklbw %%mm7, %%mm3 \n\t"
1817 "paddw %%mm1, %%mm5 \n\t"
1818 "paddw %%mm3, %%mm2 \n\t"
1819 "paddw %%mm5, %%mm2 \n\t"
1820 "movq "MANGLE(ff_w1111)
", %%mm5 \n\t"
1821 "psrlw $2, %%mm4 \n\t"
1822 "psrlw $2, %%mm2 \n\t"
1824 "movq "MANGLE(ff_bgr2VCoeff)
", %%mm1 \n\t"
1825 "movq "MANGLE(ff_bgr2VCoeff)
", %%mm3 \n\t"
1827 "pmaddwd %%mm4, %%mm1 \n\t"
1828 "pmaddwd %%mm2, %%mm3 \n\t"
1829 "pmaddwd %%mm6, %%mm4 \n\t"
1830 "pmaddwd %%mm6, %%mm2 \n\t"
1831 #ifndef FAST_BGR2YV12
1832 "psrad $8, %%mm4 \n\t"
1833 "psrad $8, %%mm1 \n\t"
1834 "psrad $8, %%mm2 \n\t"
1835 "psrad $8, %%mm3 \n\t"
1837 "packssdw %%mm2, %%mm4 \n\t"
1838 "packssdw %%mm3, %%mm1 \n\t"
1839 "pmaddwd %%mm5, %%mm4 \n\t"
1840 "pmaddwd %%mm5, %%mm1 \n\t"
1841 "add $24, %%"REG_d
" \n\t"
1842 "packssdw %%mm1, %%mm4 \n\t"
1843 "psraw $7, %%mm4 \n\t"
1845 "movq %%mm0, %%mm1 \n\t"
1846 "punpckldq %%mm4, %%mm0 \n\t"
1847 "punpckhdq %%mm4, %%mm1 \n\t"
1848 "packsswb %%mm1, %%mm0 \n\t"
1849 "paddb "MANGLE(ff_bgr2UVOffset)
", %%mm0 \n\t"
1850 "movd %%mm0, (%2, %%"REG_a
") \n\t"
1851 "punpckhdq %%mm0, %%mm0 \n\t"
1852 "movd %%mm0, (%3, %%"REG_a
") \n\t"
1853 "add $4, %%"REG_a
" \n\t"
            : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
            : "%"REG_a, "%"REG_d
        udst += chromStride;
        vdst += chromStride;
    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
    rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride);
#if !COMPILE_TEMPLATE_AMD3DNOW
                            int src2Stride, int dstStride)
    for (h=0; h < height; h++) {
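        /* interleaveBytes: zip one byte from src1 with one from src2; the SSE2
         * path handles 16 input bytes per step, the MMX fallback 8. */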
#if COMPILE_TEMPLATE_SSE2
            "xor %%"REG_a", %%"REG_a"               \n\t"
            "movdqa (%1, %%"REG_a"), %%xmm0         \n\t"
            "movdqa (%1, %%"REG_a"), %%xmm1         \n\t"
            "movdqa (%2, %%"REG_a"), %%xmm2         \n\t"
            "punpcklbw %%xmm2, %%xmm0               \n\t"
            "punpckhbw %%xmm2, %%xmm1               \n\t"
            "movntdq %%xmm0,   (%0, %%"REG_a", 2)   \n\t"
            "movntdq %%xmm1, 16(%0, %%"REG_a", 2)   \n\t"
            "add $16, %%"REG_a"                     \n\t"
            "cmp %3, %%"REG_a"                      \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"REG_a
1903 "xor %%"REG_a
", %%"REG_a
" \n\t"
1907 "movq (%1, %%"REG_a
"), %%mm0 \n\t"
1908 "movq 8(%1, %%"REG_a
"), %%mm2 \n\t"
1909 "movq %%mm0, %%mm1 \n\t"
1910 "movq %%mm2, %%mm3 \n\t"
1911 "movq (%2, %%"REG_a
"), %%mm4 \n\t"
1912 "movq 8(%2, %%"REG_a
"), %%mm5 \n\t"
1913 "punpcklbw %%mm4, %%mm0 \n\t"
1914 "punpckhbw %%mm4, %%mm1 \n\t"
1915 "punpcklbw %%mm5, %%mm2 \n\t"
1916 "punpckhbw %%mm5, %%mm3 \n\t"
1917 MOVNTQ" %%mm0, (%0, %%"REG_a
", 2) \n\t"
1918 MOVNTQ" %%mm1, 8(%0, %%"REG_a
", 2) \n\t"
1919 MOVNTQ" %%mm2, 16(%0, %%"REG_a
", 2) \n\t"
1920 MOVNTQ" %%mm3, 24(%0, %%"REG_a
", 2) \n\t"
1921 "add $16, %%"REG_a
" \n\t"
1922 "cmp %3, %%"REG_a
" \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"REG_a
1928 for (w= (width&(~15)); w <
width; w++) {
1929 dest[2*w+0] = src1[w];
1930 dest[2*w+1] = src2[w];
#if !COMPILE_TEMPLATE_SSE2
#if !COMPILE_TEMPLATE_AMD3DNOW
                  int srcStride1, int srcStride2,
                  int dstStride1, int dstStride2)
    w=width/2; h=height/2;
    __asm__ volatile(
        PREFETCH" %0    \n\t"
        PREFETCH" %1    \n\t"
        ::"m"(*(src1+srcStride1)), "m"(*(src2+srcStride2)):"memory");
        const uint8_t* s1=src1+srcStride1*(y>>1);
        for (;x<w-31;x+=32) {
1966 "movq (%1,%2), %%mm0 \n\t"
1967 "movq 8(%1,%2), %%mm2 \n\t"
1968 "movq 16(%1,%2), %%mm4 \n\t"
1969 "movq 24(%1,%2), %%mm6 \n\t"
1970 "movq %%mm0, %%mm1 \n\t"
1971 "movq %%mm2, %%mm3 \n\t"
1972 "movq %%mm4, %%mm5 \n\t"
1973 "movq %%mm6, %%mm7 \n\t"
1974 "punpcklbw %%mm0, %%mm0 \n\t"
1975 "punpckhbw %%mm1, %%mm1 \n\t"
1976 "punpcklbw %%mm2, %%mm2 \n\t"
1977 "punpckhbw %%mm3, %%mm3 \n\t"
1978 "punpcklbw %%mm4, %%mm4 \n\t"
1979 "punpckhbw %%mm5, %%mm5 \n\t"
1980 "punpcklbw %%mm6, %%mm6 \n\t"
1981 "punpckhbw %%mm7, %%mm7 \n\t"
1982 MOVNTQ" %%mm0, (%0,%2,2) \n\t"
1983 MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
1984 MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
1985 MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
1986 MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
1987 MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
1988 MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
1989 MOVNTQ" %%mm7, 56(%0,%2,2)"
                ::"r"(d), "r"(s1), "r"(x)
        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
        const uint8_t* s2=src2+srcStride2*(y>>1);
        for (;x<w-31;x+=32) {
2002 "movq (%1,%2), %%mm0 \n\t"
2003 "movq 8(%1,%2), %%mm2 \n\t"
2004 "movq 16(%1,%2), %%mm4 \n\t"
2005 "movq 24(%1,%2), %%mm6 \n\t"
2006 "movq %%mm0, %%mm1 \n\t"
2007 "movq %%mm2, %%mm3 \n\t"
2008 "movq %%mm4, %%mm5 \n\t"
2009 "movq %%mm6, %%mm7 \n\t"
2010 "punpcklbw %%mm0, %%mm0 \n\t"
2011 "punpckhbw %%mm1, %%mm1 \n\t"
2012 "punpcklbw %%mm2, %%mm2 \n\t"
2013 "punpckhbw %%mm3, %%mm3 \n\t"
2014 "punpcklbw %%mm4, %%mm4 \n\t"
2015 "punpckhbw %%mm5, %%mm5 \n\t"
2016 "punpcklbw %%mm6, %%mm6 \n\t"
2017 "punpckhbw %%mm7, %%mm7 \n\t"
2018 MOVNTQ" %%mm0, (%0,%2,2) \n\t"
2019 MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
2020 MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
2021 MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
2022 MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
2023 MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
2024 MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
2025 MOVNTQ" %%mm7, 56(%0,%2,2)"
                ::"r"(d), "r"(s2), "r"(x)
        for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
                 int srcStride1, int srcStride2,
                 int srcStride3, int dstStride)
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);
        const uint8_t* vp=src3+srcStride3*(y>>2);
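        /* YVU9 chroma covers 4x4 luma pixels: each chroma byte is duplicated
         * with punpcklbw against itself to reach the 2:1 horizontal ratio YUY2
         * needs, then interleaved between four successive luma quadwords. */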
2058 "movq (%1, %0, 4), %%mm0 \n\t"
2059 "movq (%2, %0), %%mm1 \n\t"
2060 "movq (%3, %0), %%mm2 \n\t"
2061 "movq %%mm0, %%mm3 \n\t"
2062 "movq %%mm1, %%mm4 \n\t"
2063 "movq %%mm2, %%mm5 \n\t"
2064 "punpcklbw %%mm1, %%mm1 \n\t"
2065 "punpcklbw %%mm2, %%mm2 \n\t"
2066 "punpckhbw %%mm4, %%mm4 \n\t"
2067 "punpckhbw %%mm5, %%mm5 \n\t"
2069 "movq %%mm1, %%mm6 \n\t"
2070 "punpcklbw %%mm2, %%mm1 \n\t"
2071 "punpcklbw %%mm1, %%mm0 \n\t"
2072 "punpckhbw %%mm1, %%mm3 \n\t"
2073 MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2074 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2076 "punpckhbw %%mm2, %%mm6 \n\t"
2077 "movq 8(%1, %0, 4), %%mm0 \n\t"
2078 "movq %%mm0, %%mm3 \n\t"
2079 "punpcklbw %%mm6, %%mm0 \n\t"
2080 "punpckhbw %%mm6, %%mm3 \n\t"
2081 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2082 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2084 "movq %%mm4, %%mm6 \n\t"
2085 "movq 16(%1, %0, 4), %%mm0 \n\t"
2086 "movq %%mm0, %%mm3 \n\t"
2087 "punpcklbw %%mm5, %%mm4 \n\t"
2088 "punpcklbw %%mm4, %%mm0 \n\t"
2089 "punpckhbw %%mm4, %%mm3 \n\t"
2090 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2091 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2093 "punpckhbw %%mm5, %%mm6 \n\t"
2094 "movq 24(%1, %0, 4), %%mm0 \n\t"
2095 "movq %%mm0, %%mm3 \n\t"
2096 "punpcklbw %%mm6, %%mm0 \n\t"
2097 "punpckhbw %%mm6, %%mm3 \n\t"
2098 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2099 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
2106 const int x2 = x<<2;
2109 d[8*x+2] = yp[x2+1];
2111 d[8*x+4] = yp[x2+2];
2113 d[8*x+6] = yp[x2+3];
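    /* extract_even: the loop indexes backwards from the end of the buffer so
     * the counter doubles as the store offset; %%mm7 = 0x00FF words keeps
     * every second byte before packuswb compacts the result. */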
2134 "pcmpeqw %%mm7, %%mm7 \n\t"
2135 "psrlw $8, %%mm7 \n\t"
2137 "movq -30(%1, %0, 2), %%mm0 \n\t"
2138 "movq -22(%1, %0, 2), %%mm1 \n\t"
2139 "movq -14(%1, %0, 2), %%mm2 \n\t"
2140 "movq -6(%1, %0, 2), %%mm3 \n\t"
2141 "pand %%mm7, %%mm0 \n\t"
2142 "pand %%mm7, %%mm1 \n\t"
2143 "pand %%mm7, %%mm2 \n\t"
2144 "pand %%mm7, %%mm3 \n\t"
2145 "packuswb %%mm1, %%mm0 \n\t"
2146 "packuswb %%mm3, %%mm2 \n\t"
2147 MOVNTQ" %%mm0,-15(%2, %0) \n\t"
2148 MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
        : "r"(src), "r"(dst)
        dst[count]= src[2*count];
#if !COMPILE_TEMPLATE_AMD3DNOW
        "pcmpeqw %%mm7, %%mm7       \n\t"
        "psrlw $8, %%mm7            \n\t"
        "movq -28(%1, %0, 4), %%mm0 \n\t"
        "movq -20(%1, %0, 4), %%mm1 \n\t"
        "movq -12(%1, %0, 4), %%mm2 \n\t"
        "movq  -4(%1, %0, 4), %%mm3 \n\t"
        "pand %%mm7, %%mm0          \n\t"
        "pand %%mm7, %%mm1          \n\t"
        "pand %%mm7, %%mm2          \n\t"
        "pand %%mm7, %%mm3          \n\t"
        "packuswb %%mm1, %%mm0      \n\t"
        "packuswb %%mm3, %%mm2      \n\t"
        "movq %%mm0, %%mm1          \n\t"
        "movq %%mm2, %%mm3          \n\t"
        "psrlw $8, %%mm0            \n\t"
        "psrlw $8, %%mm2            \n\t"
        "pand %%mm7, %%mm1          \n\t"
        "pand %%mm7, %%mm3          \n\t"
        "packuswb %%mm2, %%mm0      \n\t"
        "packuswb %%mm3, %%mm1      \n\t"
        MOVNTQ" %%mm0,- 7(%3, %0)   \n\t"
        MOVNTQ" %%mm1,- 7(%2, %0)   \n\t"
        : "r"(src), "r"(dst0), "r"(dst1)
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
2221 "pcmpeqw %%mm7, %%mm7 \n\t"
2222 "psrlw $8, %%mm7 \n\t"
2224 "movq -28(%1, %0, 4), %%mm0 \n\t"
2225 "movq -20(%1, %0, 4), %%mm1 \n\t"
2226 "movq -12(%1, %0, 4), %%mm2 \n\t"
2227 "movq -4(%1, %0, 4), %%mm3 \n\t"
2228 PAVGB
" -28(%2, %0, 4), %%mm0 \n\t"
2229 PAVGB
" -20(%2, %0, 4), %%mm1 \n\t"
2230 PAVGB
" -12(%2, %0, 4), %%mm2 \n\t"
2231 PAVGB
" - 4(%2, %0, 4), %%mm3 \n\t"
2232 "pand %%mm7, %%mm0 \n\t"
2233 "pand %%mm7, %%mm1 \n\t"
2234 "pand %%mm7, %%mm2 \n\t"
2235 "pand %%mm7, %%mm3 \n\t"
2236 "packuswb %%mm1, %%mm0 \n\t"
2237 "packuswb %%mm3, %%mm2 \n\t"
2238 "movq %%mm0, %%mm1 \n\t"
2239 "movq %%mm2, %%mm3 \n\t"
2240 "psrlw $8, %%mm0 \n\t"
2241 "psrlw $8, %%mm2 \n\t"
2242 "pand %%mm7, %%mm1 \n\t"
2243 "pand %%mm7, %%mm3 \n\t"
2244 "packuswb %%mm2, %%mm0 \n\t"
2245 "packuswb %%mm3, %%mm1 \n\t"
2246 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2247 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
        : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
#if !COMPILE_TEMPLATE_AMD3DNOW
        "pcmpeqw %%mm7, %%mm7       \n\t"
        "psrlw $8, %%mm7            \n\t"
        "movq -28(%1, %0, 4), %%mm0 \n\t"
        "movq -20(%1, %0, 4), %%mm1 \n\t"
        "movq -12(%1, %0, 4), %%mm2 \n\t"
        "movq  -4(%1, %0, 4), %%mm3 \n\t"
        "psrlw $8, %%mm0            \n\t"
        "psrlw $8, %%mm1            \n\t"
        "psrlw $8, %%mm2            \n\t"
        "psrlw $8, %%mm3            \n\t"
        "packuswb %%mm1, %%mm0      \n\t"
        "packuswb %%mm3, %%mm2      \n\t"
        "movq %%mm0, %%mm1          \n\t"
        "movq %%mm2, %%mm3          \n\t"
        "psrlw $8, %%mm0            \n\t"
        "psrlw $8, %%mm2            \n\t"
        "pand %%mm7, %%mm1          \n\t"
        "pand %%mm7, %%mm3          \n\t"
        "packuswb %%mm2, %%mm0      \n\t"
        "packuswb %%mm3, %%mm1      \n\t"
        MOVNTQ" %%mm0,- 7(%3, %0)   \n\t"
        MOVNTQ" %%mm1,- 7(%2, %0)   \n\t"
        : "r"(src), "r"(dst0), "r"(dst1)
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
2323 "pcmpeqw %%mm7, %%mm7 \n\t"
2324 "psrlw $8, %%mm7 \n\t"
2326 "movq -28(%1, %0, 4), %%mm0 \n\t"
2327 "movq -20(%1, %0, 4), %%mm1 \n\t"
2328 "movq -12(%1, %0, 4), %%mm2 \n\t"
2329 "movq -4(%1, %0, 4), %%mm3 \n\t"
2330 PAVGB
" -28(%2, %0, 4), %%mm0 \n\t"
2331 PAVGB
" -20(%2, %0, 4), %%mm1 \n\t"
2332 PAVGB
" -12(%2, %0, 4), %%mm2 \n\t"
2333 PAVGB
" - 4(%2, %0, 4), %%mm3 \n\t"
2334 "psrlw $8, %%mm0 \n\t"
2335 "psrlw $8, %%mm1 \n\t"
2336 "psrlw $8, %%mm2 \n\t"
2337 "psrlw $8, %%mm3 \n\t"
2338 "packuswb %%mm1, %%mm0 \n\t"
2339 "packuswb %%mm3, %%mm2 \n\t"
2340 "movq %%mm0, %%mm1 \n\t"
2341 "movq %%mm2, %%mm3 \n\t"
2342 "psrlw $8, %%mm0 \n\t"
2343 "psrlw $8, %%mm2 \n\t"
2344 "pand %%mm7, %%mm1 \n\t"
2345 "pand %%mm7, %%mm3 \n\t"
2346 "packuswb %%mm2, %%mm0 \n\t"
2347 "packuswb %%mm3, %%mm1 \n\t"
2348 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2349 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
        : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
                              int lumStride, int chromStride, int srcStride)
    const int chromWidth= -((-width)>>1);
    for (y=0; y<height; y++) {
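        /* chromWidth = ceil(width/2); each output line is assembled from the
         * extract_even / extract_odd2(avg) helpers defined above. */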
#if !COMPILE_TEMPLATE_AMD3DNOW
                              int lumStride, int chromStride, int srcStride)
    const int chromWidth= -((-width)>>1);
    for (y=0; y<height; y++) {
                              int lumStride, int chromStride, int srcStride)
    const int chromWidth= -((-width)>>1);
    for (y=0; y<height; y++) {
#if !COMPILE_TEMPLATE_AMD3DNOW
                              int lumStride, int chromStride, int srcStride)
    const int chromWidth= -((-width)>>1);
    for (y=0; y<height; y++) {
#if !COMPILE_TEMPLATE_SSE2
#if !COMPILE_TEMPLATE_AMD3DNOW
#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
#if !COMPILE_TEMPLATE_AMD3DNOW
static void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
static void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst1, uint8_t *dst2, int width, int height, int srcStride1, int srcStride2, int dstStride1, int dstStride2)
static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
static void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride)
Height should be a multiple of 2 and width should be a multiple of 2.
void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height, int srcStride, int dstStride)
static void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Width should be a multiple of 16.
static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
static void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride)
Height should be a multiple of 2 and width should be a multiple of 16.
static void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride)
Height should be a multiple of 2 and width should be a multiple of 16.
void rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride)
Height should be a multiple of 2 and width should be a multiple of 2.
static void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
static void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
static void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Height should be a multiple of 2 and width should be a multiple of 16 (If this is a problem for anyone then tell me, and I will fix it).
static void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
static void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
static void RENAME(rgb2rgb_init)(void)
static void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Width should be a multiple of 16.
static void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
static void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
static void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
static void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, uint8_t *dst, int width, int height, int srcStride1, int srcStride2, int srcStride3, int dstStride)
static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
static void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Height should be a multiple of 2 and width should be a multiple of 16.
static void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest, int width, int height, int src1Stride, int src2Stride, int dstStride)