vc1dsp_mmx.c
/*
 * VC-1 and WMV3 - DSP functions MMX-optimized
 * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "libavutil/cpu.h"
#include "libavutil/internal.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dsputil.h"
#include "dsputil_mmx.h"
#include "libavcodec/vc1dsp.h"
#include "vc1dsp.h"

#if HAVE_INLINE_ASM

#define OP_PUT(S,D)
#define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"

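/** Add the rounder held in %%mm7 to %%mm3 and %%mm4, then shift both right by SHIFT */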
#define NORMALIZE_MMX(SHIFT) \
    "paddw %%mm7, %%mm3 \n\t" /* +bias-r */ \
    "paddw %%mm7, %%mm4 \n\t" /* +bias-r */ \
    "psraw "SHIFT", %%mm3 \n\t" \
    "psraw "SHIFT", %%mm4 \n\t"

#define TRANSFER_DO_PACK(OP) \
    "packuswb %%mm4, %%mm3 \n\t" \
    OP((%2), %%mm3) \
    "movq %%mm3, (%2) \n\t"

#define TRANSFER_DONT_PACK(OP) \
    OP(0(%2), %%mm3) \
    OP(8(%2), %%mm4) \
    "movq %%mm3, 0(%2) \n\t" \
    "movq %%mm4, 8(%2) \n\t"

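/** @see MSPEL_FILTER13_CORE for use as the UNPACK parameter */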
#define DO_UNPACK(reg)   "punpcklbw %%mm0, " reg "\n\t"
#define DONT_UNPACK(reg)

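/** Load the rounding constant ROUND and broadcast it to all four words of %%mm7 */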
#define LOAD_ROUNDER_MMX(ROUND) \
    "movd "ROUND", %%mm7 \n\t" \
    "punpcklwd %%mm7, %%mm7 \n\t" \
    "punpckldq %%mm7, %%mm7 \n\t"

#define SHIFT2_LINE(OFF, R0,R1,R2,R3) \
    "paddw %%mm"#R2", %%mm"#R1" \n\t" \
    "movd (%0,%3), %%mm"#R0" \n\t" \
    "pmullw %%mm6, %%mm"#R1" \n\t" \
    "punpcklbw %%mm0, %%mm"#R0" \n\t" \
    "movd (%0,%2), %%mm"#R3" \n\t" \
    "psubw %%mm"#R0", %%mm"#R1" \n\t" \
    "punpcklbw %%mm0, %%mm"#R3" \n\t" \
    "paddw %%mm7, %%mm"#R1" \n\t" \
    "psubw %%mm"#R3", %%mm"#R1" \n\t" \
    "psraw %4, %%mm"#R1" \n\t" \
    "movq %%mm"#R1", "#OFF"(%1) \n\t" \
    "add %2, %0 \n\t"

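/**
 * Vertical pass of the 1/2-shift filter {-1,9,9,-1}: reads 8-bit pixels and
 * stores unclamped 16-bit intermediates, 12 columns by 8 rows with a 24-byte
 * row stride. %%mm6 is sacrificed to hold the *9 factor, which lets loads
 * from src be pipelined inside SHIFT2_LINE.
 */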
static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
                                       const uint8_t *src, x86_reg stride,
                                       int rnd, int64_t shift)
{
    __asm__ volatile(
        "mov $3, %%"REG_c" \n\t"
        LOAD_ROUNDER_MMX("%5")
        "movq "MANGLE(ff_pw_9)", %%mm6 \n\t"
        "1: \n\t"
        "movd (%0), %%mm2 \n\t"
        "add %2, %0 \n\t"
        "movd (%0), %%mm3 \n\t"
        "punpcklbw %%mm0, %%mm2 \n\t"
        "punpcklbw %%mm0, %%mm3 \n\t"
        SHIFT2_LINE(  0, 1, 2, 3, 4)
        SHIFT2_LINE( 24, 2, 3, 4, 1)
        SHIFT2_LINE( 48, 3, 4, 1, 2)
        SHIFT2_LINE( 72, 4, 1, 2, 3)
        SHIFT2_LINE( 96, 1, 2, 3, 4)
        SHIFT2_LINE(120, 2, 3, 4, 1)
        SHIFT2_LINE(144, 3, 4, 1, 2)
        SHIFT2_LINE(168, 4, 1, 2, 3)
        "sub %6, %0 \n\t"
        "add $8, %1 \n\t"
        "dec %%"REG_c" \n\t"
        "jnz 1b \n\t"
        : "+r"(src), "+r"(dst)
        : "r"(stride), "r"(-2*stride),
          "m"(shift), "m"(rnd), "r"(9*stride-4)
        : "%"REG_c, "memory"
    );
}

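/**
 * Horizontal pass of the 1/2-shift filter on 16-bit intermediates.
 * Data is already unpacked, so some operations can source their second
 * operand directly from memory.
 */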
#define VC1_HOR_16b_SHIFT2(OP, OPNAME)\
static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
                                             const int16_t *src, int rnd)\
{\
    int h = 8;\
\
    src -= 1;\
    rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\
    __asm__ volatile(\
        LOAD_ROUNDER_MMX("%4")\
        "movq "MANGLE(ff_pw_128)", %%mm6\n\t"\
        "movq "MANGLE(ff_pw_9)", %%mm5 \n\t"\
        "1: \n\t"\
        "movq 2*0+0(%1), %%mm1 \n\t"\
        "movq 2*0+8(%1), %%mm2 \n\t"\
        "movq 2*1+0(%1), %%mm3 \n\t"\
        "movq 2*1+8(%1), %%mm4 \n\t"\
        "paddw 2*3+0(%1), %%mm1 \n\t"\
        "paddw 2*3+8(%1), %%mm2 \n\t"\
        "paddw 2*2+0(%1), %%mm3 \n\t"\
        "paddw 2*2+8(%1), %%mm4 \n\t"\
        "pmullw %%mm5, %%mm3 \n\t"\
        "pmullw %%mm5, %%mm4 \n\t"\
        "psubw %%mm1, %%mm3 \n\t"\
        "psubw %%mm2, %%mm4 \n\t"\
        NORMALIZE_MMX("$7")\
        /* Remove bias */\
        "paddw %%mm6, %%mm3 \n\t"\
        "paddw %%mm6, %%mm4 \n\t"\
        TRANSFER_DO_PACK(OP)\
        "add $24, %1 \n\t"\
        "add %3, %2 \n\t"\
        "decl %0 \n\t"\
        "jnz 1b \n\t"\
        : "+r"(h), "+r" (src), "+r" (dst)\
        : "r"(stride), "m"(rnd)\
        : "memory"\
    );\
}

VC1_HOR_16b_SHIFT2(OP_PUT, put_)
VC1_HOR_16b_SHIFT2(OP_AVG, avg_)

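/**
 * Purely vertical or horizontal 1/2-shift interpolation on 8-bit pixels.
 * %%mm6 is sacrificed to hold the *9 factor.
 */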
#define VC1_SHIFT2(OP, OPNAME)\
static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\
                                     x86_reg stride, int rnd, x86_reg offset)\
{\
    rnd = 8-rnd;\
    __asm__ volatile(\
        "mov $8, %%"REG_c" \n\t"\
        LOAD_ROUNDER_MMX("%5")\
        "movq "MANGLE(ff_pw_9)", %%mm6\n\t"\
        "1: \n\t"\
        "movd 0(%0 ), %%mm3 \n\t"\
        "movd 4(%0 ), %%mm4 \n\t"\
        "movd 0(%0,%2), %%mm1 \n\t"\
        "movd 4(%0,%2), %%mm2 \n\t"\
        "add %2, %0 \n\t"\
        "punpcklbw %%mm0, %%mm3 \n\t"\
        "punpcklbw %%mm0, %%mm4 \n\t"\
        "punpcklbw %%mm0, %%mm1 \n\t"\
        "punpcklbw %%mm0, %%mm2 \n\t"\
        "paddw %%mm1, %%mm3 \n\t"\
        "paddw %%mm2, %%mm4 \n\t"\
        "movd 0(%0,%3), %%mm1 \n\t"\
        "movd 4(%0,%3), %%mm2 \n\t"\
        "pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0 */\
        "pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0 */\
        "punpcklbw %%mm0, %%mm1 \n\t"\
        "punpcklbw %%mm0, %%mm2 \n\t"\
        "psubw %%mm1, %%mm3 \n\t" /* -1,9,9,0 */\
        "psubw %%mm2, %%mm4 \n\t" /* -1,9,9,0 */\
        "movd 0(%0,%2), %%mm1 \n\t"\
        "movd 4(%0,%2), %%mm2 \n\t"\
        "punpcklbw %%mm0, %%mm1 \n\t"\
        "punpcklbw %%mm0, %%mm2 \n\t"\
        "psubw %%mm1, %%mm3 \n\t" /* -1,9,9,-1 */\
        "psubw %%mm2, %%mm4 \n\t" /* -1,9,9,-1 */\
        NORMALIZE_MMX("$4")\
        "packuswb %%mm4, %%mm3 \n\t"\
        OP((%1), %%mm3)\
        "movq %%mm3, (%1) \n\t"\
        "add %6, %0 \n\t"\
        "add %4, %1 \n\t"\
        "dec %%"REG_c" \n\t"\
        "jnz 1b \n\t"\
        : "+r"(src), "+r"(dst)\
        : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\
          "g"(stride-offset)\
        : "%"REG_c, "memory"\
    );\
}

VC1_SHIFT2(OP_PUT, put_)
VC1_SHIFT2(OP_AVG, avg_)

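/**
 * Core of the 1/4 and 3/4 shift bicubic interpolation: accumulates
 * -3*A1 + 18*A2 + 53*A3 - 4*A4 into %%mm3/%%mm4.
 *
 * @param UNPACK  Macro unpacking arguments from 8 to 16 bits (can be empty).
 * @param MOVQ    "movd 1" for packed 8-bit data or "movq 2" for unpacked
 *                16-bit data; the digit is the per-sample byte scale pasted
 *                into the address expressions below.
 * @param A1      Address of the 1st tap (beware of unpacked/packed).
 * @param A2      Address of the 2nd tap.
 * @param A3      Address of the 3rd tap.
 * @param A4      Address of the 4th tap.
 */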
#define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4) \
    MOVQ "*0+"A1", %%mm1 \n\t" \
    MOVQ "*4+"A1", %%mm2 \n\t" \
    UNPACK("%%mm1") \
    UNPACK("%%mm2") \
    "pmullw "MANGLE(ff_pw_3)", %%mm1\n\t" \
    "pmullw "MANGLE(ff_pw_3)", %%mm2\n\t" \
    MOVQ "*0+"A2", %%mm3 \n\t" \
    MOVQ "*4+"A2", %%mm4 \n\t" \
    UNPACK("%%mm3") \
    UNPACK("%%mm4") \
    "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
    "pmullw %%mm6, %%mm4 \n\t" /* *18 */ \
    "psubw %%mm1, %%mm3 \n\t" /* 18,-3 */ \
    "psubw %%mm2, %%mm4 \n\t" /* 18,-3 */ \
    MOVQ "*0+"A4", %%mm1 \n\t" \
    MOVQ "*4+"A4", %%mm2 \n\t" \
    UNPACK("%%mm1") \
    UNPACK("%%mm2") \
    "psllw $2, %%mm1 \n\t" /* 4* */ \
    "psllw $2, %%mm2 \n\t" /* 4* */ \
    "psubw %%mm1, %%mm3 \n\t" /* -4,18,-3 */ \
    "psubw %%mm2, %%mm4 \n\t" /* -4,18,-3 */ \
    MOVQ "*0+"A3", %%mm1 \n\t" \
    MOVQ "*4+"A3", %%mm2 \n\t" \
    UNPACK("%%mm1") \
    UNPACK("%%mm2") \
    "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
    "pmullw %%mm5, %%mm2 \n\t" /* *53 */ \
    "paddw %%mm1, %%mm3 \n\t" /* -4,53,18,-3 */ \
    "paddw %%mm2, %%mm4 \n\t" /* -4,53,18,-3 */

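/**
 * Macro building the vertical 16-bit version of vc1_put_shift[13].
 * Here offset == src_stride, so the A1..A4 addresses must be expressed
 * through %3 (src_stride) and %4 (3*src_stride).
 *
 * @param NAME  Either 1 or 3.
 * @see MSPEL_FILTER13_CORE for information on A1..A4.
 */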
#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \
static void \
vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \
                                 x86_reg src_stride, \
                                 int rnd, int64_t shift) \
{ \
    int h = 8; \
    src -= src_stride; \
    __asm__ volatile( \
        LOAD_ROUNDER_MMX("%5") \
        "movq "MANGLE(ff_pw_53)", %%mm5\n\t" \
        "movq "MANGLE(ff_pw_18)", %%mm6\n\t" \
        ".p2align 3 \n\t" \
        "1: \n\t" \
        MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
        NORMALIZE_MMX("%6") \
        TRANSFER_DONT_PACK(OP_PUT) \
        /* Last 3 (in fact 4) bytes on the line */ \
        "movd 8+"A1", %%mm1 \n\t" \
        DO_UNPACK("%%mm1") \
        "movq %%mm1, %%mm3 \n\t" \
        "paddw %%mm1, %%mm1 \n\t" \
        "paddw %%mm3, %%mm1 \n\t" /* 3* */ \
        "movd 8+"A2", %%mm3 \n\t" \
        DO_UNPACK("%%mm3") \
        "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
        "psubw %%mm1, %%mm3 \n\t" /* 18,-3 */ \
        "movd 8+"A3", %%mm1 \n\t" \
        DO_UNPACK("%%mm1") \
        "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
        "paddw %%mm1, %%mm3 \n\t" /* 53,18,-3 */ \
        "movd 8+"A4", %%mm1 \n\t" \
        DO_UNPACK("%%mm1") \
        "psllw $2, %%mm1 \n\t" /* 4* */ \
        "psubw %%mm1, %%mm3 \n\t" \
        "paddw %%mm7, %%mm3 \n\t" \
        "psraw %6, %%mm3 \n\t" \
        "movq %%mm3, 16(%2) \n\t" \
        "add %3, %1 \n\t" \
        "add $24, %2 \n\t" \
        "decl %0 \n\t" \
        "jnz 1b \n\t" \
        : "+r"(h), "+r" (src), "+r" (dst) \
        : "r"(src_stride), "r"(3*src_stride), \
          "m"(rnd), "m"(shift) \
        : "memory" \
    ); \
}

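/**
 * Macro building the horizontal 16-bit version of vc1_put_shift[13].
 * The source is 16-bit intermediates, so the A1..A4 addresses are plain
 * constant offsets from %1.
 *
 * @param NAME  Either 1 or 3.
 * @see MSPEL_FILTER13_CORE for information on A1..A4.
 */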
#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \
static void \
OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \
                                       const int16_t *src, int rnd) \
{ \
    int h = 8; \
    src -= 1; \
    rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \
    __asm__ volatile( \
        LOAD_ROUNDER_MMX("%4") \
        "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
        "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
        ".p2align 3 \n\t" \
        "1: \n\t" \
        MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4) \
        NORMALIZE_MMX("$7") \
        /* Remove bias */ \
        "paddw "MANGLE(ff_pw_128)", %%mm3 \n\t" \
        "paddw "MANGLE(ff_pw_128)", %%mm4 \n\t" \
        TRANSFER_DO_PACK(OP) \
        "add $24, %1 \n\t" \
        "add %3, %2 \n\t" \
        "decl %0 \n\t" \
        "jnz 1b \n\t" \
        : "+r"(h), "+r" (src), "+r" (dst) \
        : "r"(stride), "m"(rnd) \
        : "memory" \
    ); \
}

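/**
 * Macro building the 8-bit, any-direction version of vc1_put_shift[13].
 * Here offset is src_stride for vertical filtering or 1 for horizontal
 * filtering; the A1..A4 addresses must be expressed through %3 (offset)
 * and %4 (3*offset).
 *
 * @param NAME  Either 1 or 3.
 * @see MSPEL_FILTER13_CORE for information on A1..A4.
 */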
#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \
static void \
OPNAME ## vc1_ ## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \
                               x86_reg stride, int rnd, x86_reg offset) \
{ \
    int h = 8; \
    src -= offset; \
    rnd = 32-rnd; \
    __asm__ volatile ( \
        LOAD_ROUNDER_MMX("%6") \
        "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
        "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
        ".p2align 3 \n\t" \
        "1: \n\t" \
        MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
        NORMALIZE_MMX("$6") \
        TRANSFER_DO_PACK(OP) \
        "add %5, %1 \n\t" \
        "add %5, %2 \n\t" \
        "decl %0 \n\t" \
        "jnz 1b \n\t" \
        : "+r"(h), "+r" (src), "+r" (dst) \
        : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \
        : "memory" \
    ); \
}

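/** 1/4 shift bicubic interpolation */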
MSPEL_FILTER13_8B     (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_PUT, put_)
MSPEL_FILTER13_8B     (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )")
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_AVG, avg_)

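/** 3/4 shift bicubic interpolation */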
MSPEL_FILTER13_8B     (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_PUT, put_)
MSPEL_FILTER13_8B     (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )")
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_AVG, avg_)

typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift);
typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd);
typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset);

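/**
 * Interpolate fractional pel values by applying the proper vertical then
 * horizontal filter.
 *
 * @param dst     Destination buffer for interpolated pels.
 * @param src     Source buffer.
 * @param stride  Stride for both src and dst buffers.
 * @param hmode   Horizontal filter shift, in quarter-pel units (0..3).
 * @param vmode   Vertical filter shift, in quarter-pel units (0..3).
 * @param rnd     Rounding bias.
 */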
#define VC1_MSPEL_MC(OP)\
static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
                               int hmode, int vmode, int rnd)\
{\
    static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
        { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
    static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
        { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
    static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
        { NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\
\
    __asm__ volatile(\
        "pxor %%mm0, %%mm0 \n\t"\
        ::: "memory"\
    );\
\
    if (vmode) { /* Vertical filter to apply */\
        if (hmode) { /* Horizontal filter to apply, output to tmp */\
            static const int shift_value[] = { 0, 5, 1, 5 };\
            int shift = (shift_value[hmode]+shift_value[vmode])>>1;\
            int r;\
            DECLARE_ALIGNED(16, int16_t, tmp)[12*8];\
\
            r = (1<<(shift-1)) + rnd-1;\
            vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\
\
            vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\
            return;\
        }\
        else { /* No horizontal filter, output 8 lines to dst */\
            vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\
            return;\
        }\
    }\
\
    /* Horizontal mode with no vertical mode */\
    vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\
}

VC1_MSPEL_MC(put_)
VC1_MSPEL_MC(avg_)

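/** Macro to ease declaration of the bicubic filter interpolation functions */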
#define DECLARE_FUNCTION(a, b) \
static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \
    put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
}\
static void avg_vc1_mspel_mc ## a ## b ## _mmxext(uint8_t *dst, \
                                                  const uint8_t *src, \
                                                  int stride, int rnd) \
{ \
    avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
}

DECLARE_FUNCTION(0, 1)
DECLARE_FUNCTION(0, 2)
DECLARE_FUNCTION(0, 3)

DECLARE_FUNCTION(1, 0)
DECLARE_FUNCTION(1, 1)
DECLARE_FUNCTION(1, 2)
DECLARE_FUNCTION(1, 3)

DECLARE_FUNCTION(2, 0)
DECLARE_FUNCTION(2, 1)
DECLARE_FUNCTION(2, 2)
DECLARE_FUNCTION(2, 3)

DECLARE_FUNCTION(3, 0)
DECLARE_FUNCTION(3, 1)
DECLARE_FUNCTION(3, 2)
DECLARE_FUNCTION(3, 3)

static void vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, int linesize,
                                        DCTELEM *block)
{
    int dc = block[0];
    dc = (17 * dc + 4) >> 3;
    dc = (17 * dc + 64) >> 7;
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movd %0, %%mm2 \n\t"
        "movd %1, %%mm3 \n\t"
        "movd %2, %%mm4 \n\t"
        "movd %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movd %%mm2, %0 \n\t"
        "movd %%mm3, %1 \n\t"
        "movd %%mm4, %2 \n\t"
        "movd %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}

534 
535 static void vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, int linesize,
536  DCTELEM *block)
537 {
538  int dc = block[0];
539  dc = (17 * dc + 4) >> 3;
540  dc = (12 * dc + 64) >> 7;
541  __asm__ volatile(
542  "movd %0, %%mm0 \n\t"
543  "pshufw $0, %%mm0, %%mm0 \n\t"
544  "pxor %%mm1, %%mm1 \n\t"
545  "psubw %%mm0, %%mm1 \n\t"
546  "packuswb %%mm0, %%mm0 \n\t"
547  "packuswb %%mm1, %%mm1 \n\t"
548  ::"r"(dc)
549  );
550  __asm__ volatile(
551  "movd %0, %%mm2 \n\t"
552  "movd %1, %%mm3 \n\t"
553  "movd %2, %%mm4 \n\t"
554  "movd %3, %%mm5 \n\t"
555  "paddusb %%mm0, %%mm2 \n\t"
556  "paddusb %%mm0, %%mm3 \n\t"
557  "paddusb %%mm0, %%mm4 \n\t"
558  "paddusb %%mm0, %%mm5 \n\t"
559  "psubusb %%mm1, %%mm2 \n\t"
560  "psubusb %%mm1, %%mm3 \n\t"
561  "psubusb %%mm1, %%mm4 \n\t"
562  "psubusb %%mm1, %%mm5 \n\t"
563  "movd %%mm2, %0 \n\t"
564  "movd %%mm3, %1 \n\t"
565  "movd %%mm4, %2 \n\t"
566  "movd %%mm5, %3 \n\t"
567  :"+m"(*(uint32_t*)(dest+0*linesize)),
568  "+m"(*(uint32_t*)(dest+1*linesize)),
569  "+m"(*(uint32_t*)(dest+2*linesize)),
570  "+m"(*(uint32_t*)(dest+3*linesize))
571  );
572  dest += 4*linesize;
573  __asm__ volatile(
574  "movd %0, %%mm2 \n\t"
575  "movd %1, %%mm3 \n\t"
576  "movd %2, %%mm4 \n\t"
577  "movd %3, %%mm5 \n\t"
578  "paddusb %%mm0, %%mm2 \n\t"
579  "paddusb %%mm0, %%mm3 \n\t"
580  "paddusb %%mm0, %%mm4 \n\t"
581  "paddusb %%mm0, %%mm5 \n\t"
582  "psubusb %%mm1, %%mm2 \n\t"
583  "psubusb %%mm1, %%mm3 \n\t"
584  "psubusb %%mm1, %%mm4 \n\t"
585  "psubusb %%mm1, %%mm5 \n\t"
586  "movd %%mm2, %0 \n\t"
587  "movd %%mm3, %1 \n\t"
588  "movd %%mm4, %2 \n\t"
589  "movd %%mm5, %3 \n\t"
590  :"+m"(*(uint32_t*)(dest+0*linesize)),
591  "+m"(*(uint32_t*)(dest+1*linesize)),
592  "+m"(*(uint32_t*)(dest+2*linesize)),
593  "+m"(*(uint32_t*)(dest+3*linesize))
594  );
595 }
596 
597 static void vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, int linesize,
598  DCTELEM *block)
599 {
600  int dc = block[0];
601  dc = ( 3 * dc + 1) >> 1;
602  dc = (17 * dc + 64) >> 7;
603  __asm__ volatile(
604  "movd %0, %%mm0 \n\t"
605  "pshufw $0, %%mm0, %%mm0 \n\t"
606  "pxor %%mm1, %%mm1 \n\t"
607  "psubw %%mm0, %%mm1 \n\t"
608  "packuswb %%mm0, %%mm0 \n\t"
609  "packuswb %%mm1, %%mm1 \n\t"
610  ::"r"(dc)
611  );
612  __asm__ volatile(
613  "movq %0, %%mm2 \n\t"
614  "movq %1, %%mm3 \n\t"
615  "movq %2, %%mm4 \n\t"
616  "movq %3, %%mm5 \n\t"
617  "paddusb %%mm0, %%mm2 \n\t"
618  "paddusb %%mm0, %%mm3 \n\t"
619  "paddusb %%mm0, %%mm4 \n\t"
620  "paddusb %%mm0, %%mm5 \n\t"
621  "psubusb %%mm1, %%mm2 \n\t"
622  "psubusb %%mm1, %%mm3 \n\t"
623  "psubusb %%mm1, %%mm4 \n\t"
624  "psubusb %%mm1, %%mm5 \n\t"
625  "movq %%mm2, %0 \n\t"
626  "movq %%mm3, %1 \n\t"
627  "movq %%mm4, %2 \n\t"
628  "movq %%mm5, %3 \n\t"
629  :"+m"(*(uint32_t*)(dest+0*linesize)),
630  "+m"(*(uint32_t*)(dest+1*linesize)),
631  "+m"(*(uint32_t*)(dest+2*linesize)),
632  "+m"(*(uint32_t*)(dest+3*linesize))
633  );
634 }
635 
636 static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize,
637  DCTELEM *block)
638 {
639  int dc = block[0];
640  dc = (3 * dc + 1) >> 1;
641  dc = (3 * dc + 16) >> 5;
642  __asm__ volatile(
643  "movd %0, %%mm0 \n\t"
644  "pshufw $0, %%mm0, %%mm0 \n\t"
645  "pxor %%mm1, %%mm1 \n\t"
646  "psubw %%mm0, %%mm1 \n\t"
647  "packuswb %%mm0, %%mm0 \n\t"
648  "packuswb %%mm1, %%mm1 \n\t"
649  ::"r"(dc)
650  );
651  __asm__ volatile(
652  "movq %0, %%mm2 \n\t"
653  "movq %1, %%mm3 \n\t"
654  "movq %2, %%mm4 \n\t"
655  "movq %3, %%mm5 \n\t"
656  "paddusb %%mm0, %%mm2 \n\t"
657  "paddusb %%mm0, %%mm3 \n\t"
658  "paddusb %%mm0, %%mm4 \n\t"
659  "paddusb %%mm0, %%mm5 \n\t"
660  "psubusb %%mm1, %%mm2 \n\t"
661  "psubusb %%mm1, %%mm3 \n\t"
662  "psubusb %%mm1, %%mm4 \n\t"
663  "psubusb %%mm1, %%mm5 \n\t"
664  "movq %%mm2, %0 \n\t"
665  "movq %%mm3, %1 \n\t"
666  "movq %%mm4, %2 \n\t"
667  "movq %%mm5, %3 \n\t"
668  :"+m"(*(uint32_t*)(dest+0*linesize)),
669  "+m"(*(uint32_t*)(dest+1*linesize)),
670  "+m"(*(uint32_t*)(dest+2*linesize)),
671  "+m"(*(uint32_t*)(dest+3*linesize))
672  );
673  dest += 4*linesize;
674  __asm__ volatile(
675  "movq %0, %%mm2 \n\t"
676  "movq %1, %%mm3 \n\t"
677  "movq %2, %%mm4 \n\t"
678  "movq %3, %%mm5 \n\t"
679  "paddusb %%mm0, %%mm2 \n\t"
680  "paddusb %%mm0, %%mm3 \n\t"
681  "paddusb %%mm0, %%mm4 \n\t"
682  "paddusb %%mm0, %%mm5 \n\t"
683  "psubusb %%mm1, %%mm2 \n\t"
684  "psubusb %%mm1, %%mm3 \n\t"
685  "psubusb %%mm1, %%mm4 \n\t"
686  "psubusb %%mm1, %%mm5 \n\t"
687  "movq %%mm2, %0 \n\t"
688  "movq %%mm3, %1 \n\t"
689  "movq %%mm4, %2 \n\t"
690  "movq %%mm5, %3 \n\t"
691  :"+m"(*(uint32_t*)(dest+0*linesize)),
692  "+m"(*(uint32_t*)(dest+1*linesize)),
693  "+m"(*(uint32_t*)(dest+2*linesize)),
694  "+m"(*(uint32_t*)(dest+3*linesize))
695  );
696 }
697 
void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
{
    dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_vc1_mspel_mc00_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx;
    dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx;

    dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx;
    dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx;

    dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx;
    dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx;
    dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx;

    dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx;
    dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx;
    dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx;
}
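
/*
 * Illustrative use (a sketch, not code from this file): the mspel tables
 * built here are indexed by hmode + 4*vmode, so a caller holding an
 * initialized VC1DSPContext selects the interpolator as
 *
 *     dsp->put_vc1_mspel_pixels_tab[hmode + 4 * vmode](dst, src, stride, rnd);
 *
 * with hmode/vmode the quarter-pel shifts in x and y (0..3).
 */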

void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
{
    dsp->avg_vc1_mspel_pixels_tab[ 0] = ff_avg_vc1_mspel_mc00_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmxext;

    dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[ 5] = avg_vc1_mspel_mc11_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[ 9] = avg_vc1_mspel_mc12_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_mmxext;

    dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[ 6] = avg_vc1_mspel_mc21_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_mmxext;

    dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmxext;

    dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmxext;
    dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmxext;
    dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext;
    dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext;
}
#endif /* HAVE_INLINE_ASM */