1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
|
diff -Nurp libfame-0.9.1/src/dct_mmx.h libfame-0.9.1-pic/src/dct_mmx.h
--- libfame-0.9.1/src/dct_mmx.h 2002-04-14 12:22:05.000000000 +0100
+++ libfame-0.9.1-pic/src/dct_mmx.h 2005-04-24 00:48:52.000000000 +0100
@@ -22,6 +22,9 @@
#define precision
+extern FAME_ALIGNED short const _mmx_1[];
+extern FAME_ALIGNED short const _mmx_cos[];
+
static void inline dct_aan_pass(dct_t *cache)
{
// register unsigned short const *mmx_cos = _mmx_cos;
@@ -66,42 +69,42 @@ static void inline dct_aan_pass(dct_t *c
#ifdef precision
"psllw $0x01, %%mm5\n" /* precision(va0) += 1 bit */
#endif
- "paddw " ASMSYM "_mmx_1, %%mm4\n" /* + 1 */
+ "paddw (%2), %%mm4\n" /* + 1 */
// "pmulhw 16(%1), %%mm5\n" /* (v14+v16)*COS6 -> mm5 (va0) */
- "pmulhw " ASMSYM "_mmx_cos+16, %%mm5\n" /* (v14+v16)*COS6 -> mm5 (va0) */
+ "pmulhw 16(%3), %%mm5\n" /* (v14+v16)*COS6 -> mm5 (va0) */
"" /* STEP 4 */
#ifdef precision
"psllw $0x02, %%mm6\n" /* precision(v22) += 1 bit */
#else
"psllw $0x01, %%mm6\n" /* */
#endif
- "paddw " ASMSYM "_mmx_1, %%mm4\n" /* + 1 */
+ "paddw (%2), %%mm4\n" /* + 1 */
// "pmulhw 8(%1), %%mm6\n" /* 2*v22*COS4/2 -> mm6 (v32)*/
- "pmulhw " ASMSYM "_mmx_cos+8, %%mm6\n" /* 2*v22*COS4/2 -> mm6 (v32)*/
+ "pmulhw 8(%3), %%mm6\n" /* 2*v22*COS4/2 -> mm6 (v32)*/
#ifdef precision
"psllw $0x02, %%mm2\n" /* precision(v15) += 1 bit */
#else
"psllw $0x01, %%mm2\n" /* */
#endif
- "paddw " ASMSYM "_mmx_1, %%mm4\n" /* + 1 */
+ "paddw (%2), %%mm4\n" /* + 1 */
// "pmulhw 8(%1), %%mm2\n" /* 2*v15*COS4/2 -> mm2 (v35) */
- "pmulhw " ASMSYM "_mmx_cos+8, %%mm2\n" /* 2*v15*COS4/2 -> mm2 (v35) */
+ "pmulhw 8(%3), %%mm2\n" /* 2*v15*COS4/2 -> mm2 (v35) */
#ifdef precision
"psllw $0x02, %%mm4\n" /* precision(v14) += 1 bit */
#else
"psllw $0x01, %%mm4\n" /* */
#endif
- "paddw " ASMSYM "_mmx_1, %%mm4\n" /* + 1 */
+ "paddw (%2), %%mm4\n" /* + 1 */
// "pmulhw 0(%1), %%mm4\n" /* 2 * v14 * -COS2/2 -> mm4 */
- "pmulhw " ASMSYM "_mmx_cos, %%mm4\n" /* 2 * v14 * -COS2/2 -> mm4 */
+ "pmulhw (%3), %%mm4\n" /* 2 * v14 * -COS2/2 -> mm4 */
"psubsw %%mm5, %%mm4\n" /* v14*-COS2 - va0 -> mm4 (v34) */
#ifdef precision
"psllw $0x01, %%mm1\n" /* precision(v16) += 1 bit */
#endif
"psubsw %%mm1, %%mm5\n" /* va0 - v16 -> mm5 */
- "paddw " ASMSYM "_mmx_1, %%mm4\n" /* + 1 */
+ "paddw (%2), %%mm4\n" /* + 1 */
// "pmulhw 24(%1), %%mm1\n" /* v16 * (COS8 - 1) -> mm1 */
- "pmulhw " ASMSYM "_mmx_cos+24, %%mm1\n" /* v16 * (COS8 - 1) -> mm1 */
+ "pmulhw 24(%3), %%mm1\n" /* v16 * (COS8 - 1) -> mm1 */
"psubsw %%mm5, %%mm1\n" /* v16 * COS8 - va0 -> mm1 (v36)*/
"" /* STEP 5 */
"movq 0x70(%0), %%mm0\n" /* retrieve v07 -> mm0 */
@@ -138,8 +141,8 @@ static void inline dct_aan_pass(dct_t *c
"movq %%mm0, 0x30(%0)\n" /* store line 3 */
"movq %%mm4, 0x50(%0)\n" /* store line 5 */
"movq %%mm2, 0x70(%0)\n" /* store line 7 */
- : "=r"(cache)/*, "=r"(mmx_cos)*/
- : "0"(cache)/*, "1"(mmx_cos)*/
+ : "=r"(cache)
+ : "0"(cache), "r"(_mmx_1), "r"(_mmx_cos)
: "memory");
}
diff -Nurp libfame-0.9.1/src/dequantize_mmx.h libfame-0.9.1-pic/src/dequantize_mmx.h
--- libfame-0.9.1/src/dequantize_mmx.h 2002-04-23 22:40:56.000000000 +0100
+++ libfame-0.9.1-pic/src/dequantize_mmx.h 2005-04-24 00:44:26.000000000 +0100
@@ -27,8 +27,8 @@
"pmullw 0x" #x "8(%3), %%mm5\n" /* premultiply for iDCT */ \
"psrlw $0x0b, %%mm4\n" /* keep 5 bits */ \
"psrlw $0x0b, %%mm5\n" /* keep 5 bits */ \
- "paddw " ASMSYM "_mmx_1, %%mm4\n" /* + 1 */ \
- "paddw " ASMSYM "_mmx_1, %%mm5\n" /* + 1 */ \
+ "paddw (%8), %%mm4\n" /* + 1 */ \
+ "paddw (%8), %%mm5\n" /* + 1 */ \
"psrlw $0x01, %%mm4\n" /* keep 4 bits rounded */ \
"psrlw $0x01, %%mm5\n" /* keep 4 bits rounded */ \
"psllw $0x04, %%mm0\n" /* multiply by 16 for iDCT */ \
@@ -107,7 +107,7 @@ static void inline dequantize_intra_glob
DEQUANTIZE_GLOBAL_MISMATCH_CONTROL()
DEQUANTIZE_PRESCALE_STEP(7)
: "=r"(block), "=r"(dqmatrix), "=r"(cache), "=r"(psmatrix)
- : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix)
+ : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix), "r"(_mmx_1)
: "memory");
asm volatile("movd %%mm6, %0\n" /* export mismatch */
@@ -160,8 +160,8 @@ static void inline dequantize_intra_loca
"pcmpeqw %%mm7, %%mm3\n" /* invert sign */ \
"paddw %%mm2, %%mm0\n" /* sub 1 if >0 */ \
"paddw %%mm3, %%mm1\n" /* sub 1 if >0 */ \
- "por " ASMSYM "_mmx_1, %%mm0\n" /* or 1 */ \
- "por " ASMSYM "_mmx_1, %%mm1\n" /* or 1 */ \
+ "por (%8), %%mm0\n" /* or 1 */ \
+ "por (%8), %%mm1\n" /* or 1 */ \
"pand %%mm4, %%mm0\n" /* [0-3]=0 if [0-3] was zero */ \
"pand %%mm5, %%mm1\n" /* [4-7]=0 if [4-7] was zero */
@@ -184,7 +184,7 @@ static void inline dequantize_intra_loca
DEQUANTIZE_INTRA_LOCAL_STEP(7)
DEQUANTIZE_PRESCALE_STEP(7)
: "=r"(block), "=r"(dqmatrix), "=r"(cache), "=r"(psmatrix)
- : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix)
+ : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix), "r"(_mmx_1)
: "memory");
}
@@ -256,7 +256,7 @@ static void inline dequantize_inter_glob
/* resetting the accumulator when the block is coded intra */
DEQUANTIZE_PRESCALE_STEP(7)
: "=r"(block), "=r"(dqmatrix), "=r"(cache), "=r"(psmatrix)
- : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix)
+ : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix), "r"(_mmx_1)
: "memory");
asm volatile("movd %%mm6, %0\n" /* export mismatch */
@@ -324,8 +324,8 @@ static void inline dequantize_inter_loca
"pcmpeqw %%mm7, %%mm3\n" /* invert sign */ \
"paddw %%mm2, %%mm0\n" /* sub 1 if >0 */ \
"paddw %%mm3, %%mm1\n" /* sub 1 if >0 */ \
- "por " ASMSYM "_mmx_1, %%mm0\n" /* or 1 */ \
- "por " ASMSYM "_mmx_1, %%mm1\n" /* or 1 */ \
+ "por (%8), %%mm0\n" /* or 1 */ \
+ "por (%8), %%mm1\n" /* or 1 */ \
"pand %%mm4, %%mm0\n" /* [0-3]=0 if [0-3] was zero */ \
"pand %%mm5, %%mm1\n" /* [4-7]=0 if [4-7] was zero */
@@ -348,6 +348,6 @@ static void inline dequantize_inter_loca
DEQUANTIZE_INTER_LOCAL_STEP(7)
DEQUANTIZE_PRESCALE_STEP(7)
: "=r"(block), "=r"(dqmatrix), "=r"(cache), "=r"(psmatrix)
- : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix)
+ : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix), "r"(_mmx_1)
: "memory");
}
diff -Nurp libfame-0.9.1/src/fame_syntax_mpeg1.c libfame-0.9.1-pic/src/fame_syntax_mpeg1.c
--- libfame-0.9.1/src/fame_syntax_mpeg1.c 2002-10-05 13:44:47.000000000 +0100
+++ libfame-0.9.1-pic/src/fame_syntax_mpeg1.c 2005-04-24 00:19:09.000000000 +0100
@@ -469,89 +469,6 @@ static void mpeg1_block_intra(fame_synta
fast_bitbuffer_write(data, shift, table[v+255].code, table[v+255].length);
/* encode AC coefficients */
-#if defined(HAS_BSWAP)
- {
- unsigned long dummy1, dummy2;
-
- /* Note:
- movsx mpeg1_table_clip+4096(, %%eax ,2), %%eax
- has been replaced by
- movw mpeg1_table_clip+4096(, %%eax ,2), %%ax
- movsx %%ax, %%eax
- because the first instruction failed on a PIII!! (wrong sign extension)
- whereas it worked well on my P75 :)
- */
- /* Ok, a bit of explanations for a couple of tricks:
- The DC value of block is already coded and stored in v so we can use it to store something.
- We add one index to the zigzag table so that after coding block[63] we go to index 0. There
- we need to escape the zero counting loop (1), what we ensure by putting a non-zero value in
- the DC coefficient. Then we can test for index == 0 to exit.
- Now this non-zero value is a bit special :)
- In order to have one more 'half' register, we store sp value (16 less significant bit of the
- 32 bit register esp) *plus one* in the DC coefficient. Since the stack is aligned at an
- address multiple of 4 bytes (at least), we are sure that sp != 0xffff and thus sp+1 will
- never be zero. We then retrieve sp at the end for it is needed by 'pop' instructions.
- */
- /* TODO : echange the role of edx and esp */
- __asm__ __volatile__ ("pushl %%ebx\n" /* save ebx */
- "pushl %%ebp\n" /* save stack pointer */
- "inc %%sp\n" /* make sure sp != 0 */
- "movw %%sp, (%%edx)\n" /* store sp+1 in DC ;) */
- "movl %%esi, %%ebp\n" /* ebp = vlc_table */
- "xorl %%eax, %%eax\n" /* eax = 0 */
- "movl $" ASMSYM "mpeg1_zigzag_table+1, %%esi\n" /*esi = zigzag*/
- "lea 1(%%esi), %%ebx\n" /* ebx = zigzag_table+1*/
- "neg %%ebx\n" /* ebx = -(esi+1) */
- ".p2align 4,,7\n" /* align for jump */
- "0: xorw %%sp, %%sp\n" /* sp = 0 */
- "1: movb (%%esi), %%al\n" /* eax = index in block*/
- "incl %%esi\n" /* (faster than lodsb) */
- "addw (%%edx, %%eax, 2), %%sp\n" /* sp = unzig */
- "jz 1b\n" /* coeff == 0 then loop*/
- "orl %%eax, %%eax\n" /* index == 0 then quit*/
- "jz 2f\n" /* (faster than jcxz) */
- "movsx %%sp, %%eax\n" /* extend sign */
- "movw " ASMSYM "mpeg1_table_clip_data+4096(, %%eax ,2), %%ax\n" /*clip*/
- "movsx %%ax, %%eax\n" /* extend sign */
- "addl %%esi, %%ebx\n" /* ebx = run */
- "shll $7, %%eax\n" /* eax *= 128(indexing)*/
- "lea (%%eax, %%ebx, 2), %%eax\n" /*eax = 2 * offset*/
- "lea (%%ebp, %%eax, 4), %%ebx\n" /* ebx = &vlc */
- "movl (%%ebx), %%eax\n" /* eax = code */
- "addl 4(%%ebx), %%ecx\n" /* ecx = shift+=length */
- "xorl %%ebx, %%ebx\n" /* ebx = 0 */
- "shrd %%cl, %%eax, %%ebx\n" /* adjust code to fit */
- "shr %%cl, %%eax\n" /* adjust code to fit */
- "bswap %%eax\n" /* reverse byte order of code */
- "bswap %%ebx\n" /* reverse byte order of code */
- "or %%eax, (%%edi)\n" /* put first 32 bits */
- "movl %%ecx, %%eax\n" /* eax = shift + length*/
- "shrl $5, %%eax\n" /* get dword increment */
- "andl $31, %%ecx\n" /* mask shift */
- "lea (%%edi, %%eax, 4), %%edi\n"/* data+=(ecx>32)*/
- "orl %%ebx, (%%edi)\n" /* put last 32 bits */
- "xorl %%eax, %%eax\n" /* eax = 0 */
- "lea 1(%%esi), %%ebx\n" /* ebx = esi + 1 (last)*/
- "neg %%ebx\n" /* ebx = -(esi + 1) */
- "jmp 0b\n" /* loop */
- "2:\n"
- "movw (%%edx), %%sp\n" /* retrieve sp+1 */
- "dec %%sp\n" /* restore esp */
- "popl %%ebp\n" /* reload stack pointer*/
- "popl %%ebx\n" /* reload ebx */
- : "=c"(shift),
- "=a"(dummy1),
- "=d"(block),
- "=D"(data),
- "=S"(dummy2)
- : "d"(block),
- "c"(shift),
- "D"(data),
- "S"(syntax_mpeg1->vlc_table)
- : "memory");
- block[0] = v; /* restore DC value */
- }
-#else
{
short i;
unsigned long last;
@@ -573,7 +490,6 @@ static void mpeg1_block_intra(fame_synta
}
}
}
-#endif /* HAS_BSWAP */
/* mark end of block */
fast_bitbuffer_write(data, shift, 2, 2);
diff -Nurp libfame-0.9.1/src/half_mmx.h libfame-0.9.1-pic/src/half_mmx.h
--- libfame-0.9.1/src/half_mmx.h 2002-04-30 19:04:02.000000000 +0100
+++ libfame-0.9.1-pic/src/half_mmx.h 2005-04-24 00:44:49.000000000 +0100
@@ -68,8 +68,8 @@ static void inline mmx_interpolate(unsig
"paddw %%mm5, %%mm6\n" /* mm6 = ref00+ref10+ref11+1-r 4-7*/
"psrlw $1, %%mm4\n" /* divide by 2 */
"psrlw $1, %%mm5\n" /* divide by 2 */
- "paddw " ASMSYM "_mmx_one, %%mm3\n" /* add 1 */
- "paddw " ASMSYM "_mmx_one, %%mm6\n" /* add 1 */
+ "paddw (%8), %%mm3\n" /* add 1 */
+ "paddw (%8), %%mm6\n" /* add 1 */
"packuswb %%mm5, %%mm4\n" /* pack to byte and saturate */
"movq 1(%3), %%mm1\n" /* mm1 = [ref+1] */
"movq %%mm1, %%mm2\n" /* mm2 = mm1 */
@@ -87,7 +87,7 @@ static void inline mmx_interpolate(unsig
"movl 12(%0), %3\n" /* %3 = ref[3] */
"movq %%mm3, (%3)\n" /* store in frame */
: "=r"(ref), "=r"(pitch), "=r"(rc), "=r"(dummy)
- : "0"(ref), "1"(pitch), "2"(rc), "3"(dummy)
+ : "0"(ref), "1"(pitch), "2"(rc), "3"(dummy), "r"(_mmx_one)
: "memory");
}
diff -Nurp libfame-0.9.1/src/half_sse.h libfame-0.9.1-pic/src/half_sse.h
--- libfame-0.9.1/src/half_sse.h 2002-01-27 02:24:56.000000000 +0000
+++ libfame-0.9.1-pic/src/half_sse.h 2005-04-24 00:57:08.000000000 +0100
@@ -71,8 +71,8 @@ static void inline mmx_interpolate_signe
"paddw %%mm5, %%mm6\n" /* mm6 = ref00+ref10+ref11+1-r 4-7*/
"psrlw $1, %%mm4\n" /* divide by 2 */
"psrlw $1, %%mm5\n" /* divide by 2 */
- "paddw " ASMSYM "_mmx_one, %%mm3\n" /* add 1 */
- "paddw " ASMSYM "_mmx_one, %%mm6\n" /* add 1 */
+ "paddw (%8), %%mm3\n" /* add 1 */
+ "paddw (%8), %%mm6\n" /* add 1 */
"packuswb %%mm5, %%mm4\n" /* pack to byte and saturate */
"movq 1(%3), %%mm1\n" /* mm1 = [ref+1] */
"movq %%mm1, %%mm2\n" /* mm2 = mm1 */
@@ -90,7 +90,7 @@ static void inline mmx_interpolate_signe
"movl 12(%0), %3\n" /* %3 = ref[3] */
"movq %%mm3, (%3)\n" /* store in frame */
: "=r"(ref), "=r"(pitch), "=r"(rc), "=r"(dummy)
- : "0"(ref), "1"(pitch), "2"(rc), "3"(dummy)
+ : "0"(ref), "1"(pitch), "2"(rc), "3"(dummy), "r"(_mmx_one)
: "memory");
}
diff -Nurp libfame-0.9.1/src/idct_mmx.h libfame-0.9.1-pic/src/idct_mmx.h
--- libfame-0.9.1/src/idct_mmx.h 2002-04-14 12:22:05.000000000 +0100
+++ libfame-0.9.1-pic/src/idct_mmx.h 2005-04-24 00:51:00.000000000 +0100
@@ -18,6 +18,10 @@
*/
/*************************** MMX accelerated iDCT ****************************/
+extern FAME_ALIGNED short const _mmx_1[];
+extern FAME_ALIGNED short const _mmx_cos[];
+extern FAME_ALIGNED short const _mmx_icos[];
+
static void inline idct_aan_pass(dct_t * block)
{
// register unsigned short const *mmx_icos = _mmx_icos;
@@ -65,9 +69,9 @@ static void inline idct_aan_pass(dct_t *
block[row*8+6] = v45; - v71, v11, v44, v65, v24 -
*/
"psllw $0x02, %%mm5\n" /* adjust v22 for multiply */
- "paddw " ASMSYM "_mmx_1, %%mm5\n" /* + 1 for rounding */
+ "paddw (%2), %%mm5\n" /* + 1 for rounding */
// "pmulhw 8(%1), %%mm5\n" /* 4*v15*ICOS4/4 -> mm5 (v23) */
- "pmulhw " ASMSYM "_mmx_icos+8, %%mm5\n" /* 4*v15*ICOS4/4 -> mm5 (v23)*/
+ "pmulhw 8(%3), %%mm5\n" /* 4*v15*ICOS4/4 -> mm5 (v23)*/
"psubsw %%mm4, %%mm5\n" /* v23 - v62 -> mm5 (v24) */
"movq %%mm3, %%mm6\n" /* v44 -> mm6 */
"paddsw %%mm5, %%mm6\n" /* v44 + v24 -> mm6 (v45) */
@@ -125,25 +129,25 @@ static void inline idct_aan_pass(dct_t *
block[row*8+4] += v55; - -
*/
"psllw $0x02, %%mm0\n" /* adjust v12 for multiply */
- "paddw " ASMSYM "_mmx_1, %%mm0\n" /* + 1 for rounding */
+ "paddw (%2), %%mm0\n" /* + 1 for rounding */
// "pmulhw 8(%1), %%mm0\n" /* 4*v12*ICOS4/4 -> mm0 (v13) */
- "pmulhw " ASMSYM "_mmx_icos+8, %%mm0\n" /* 4*v12*ICOS4/4 -> mm0 (v13) */
+ "pmulhw 8(%3), %%mm0\n" /* 4*v12*ICOS4/4 -> mm0 (v13) */
"movq %%mm2, %%mm6\n" /* v51 -> mm6 */
"psubsw %%mm1, %%mm6\n" /* v51 - v71 -> mm6 (va2) */
"psllw $0x03, %%mm2\n" /* adjust v51 for multiply */
- "paddw " ASMSYM "_mmx_1, %%mm2\n" /* + 1 for rounding */
+ "paddw (%2), %%mm2\n" /* + 1 for rounding */
/* should add another one here but it seems to look better without */
// "pmulhw 16(%1), %%mm2\n" /* 8*v51*ICOS6/8 -> mm2 (v53) */
- "pmulhw " ASMSYM "_mmx_icos+16, %%mm2\n" /* 8*v51*ICOS6/8 -> mm2 (v53) */
+ "pmulhw 16(%3), %%mm2\n" /* 8*v51*ICOS6/8 -> mm2 (v53) */
"psllw $0x02, %%mm1\n" /* adjust v71 for multiply */
- "paddw " ASMSYM "_mmx_1, %%mm1\n" /* + 1 for rounding */
+ "paddw (%2), %%mm1\n" /* + 1 for rounding */
/* should add another one here but it seems to look better without */
// "pmulhw 0(%1), %%mm1\n" /* 4*v71*ICOS2/4 -> mm1 (v73) */
- "pmulhw " ASMSYM "_mmx_icos, %%mm1\n" /* 4*v71*ICOS2/4 -> mm1 (v73) */
+ "pmulhw (%3), %%mm1\n" /* 4*v71*ICOS2/4 -> mm1 (v73) */
"psllw $0x01, %%mm6\n" /* adjust va2 for multiply */
- "paddw " ASMSYM "_mmx_1, %%mm6\n" /* + 1 for rounding */
+ "paddw (%2), %%mm6\n" /* + 1 for rounding */
// "pmulhw 24(%1), %%mm6\n" /* 2*v12*ICOS8/2 -> mm6 (va3) */
- "pmulhw " ASMSYM "_mmx_icos+24, %%mm6\n" /* 2*v12*ICOS8/2 -> mm6 (va3) */
+ "pmulhw 24(%3), %%mm6\n" /* 2*v12*ICOS8/2 -> mm6 (va3) */
"psubsw %%mm6, %%mm2\n" /* v53 - va3 -> mm2 (v54) */
"psubsw %%mm6, %%mm1\n" /* v73 - va3 -> mm1 (v74) */
"psubsw %%mm3, %%mm1\n" /* v74 - v32 -> mm3 (v75) */
@@ -167,8 +171,8 @@ static void inline idct_aan_pass(dct_t *
"paddsw %%mm0, %%mm7\n" /* v65 + v55 -> mm7 */
"movq %%mm6, 0x30(%0)\n" /* mm6 -> line 3 */
"movq %%mm7, 0x40(%0)\n" /* mm7 -> line 4 */
- : "=r"(block)/*, "=r"(mmx_icos)*/
- : "0"(block)/*, "1"(mmx_icos)*/
+ : "=r"(block)
+ : "0"(block), "r"(_mmx_1), "r"(_mmx_icos)
: "memory");
}
|