1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
|
/*
* Copyright (C) 2010 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This is a fast-and-accurate implementation of inverse Discrete Cosine
* Transform (IDCT) for ARMv6+. It also performs dequantization of the input
* coefficients just like other methods.
*
* This implementation is based on the scaled 1-D DCT algorithm proposed by
* Arai, Agui, and Nakajima. The following code is based on the figure 4-8
* on page 52 of the JPEG textbook by Pennebaker and Mitchell. Coefficients
* are (almost) directly mapped into registers.
*
* The accuracy is achieved by using SMULWy and SMLAWy instructions. Both
* multiply 32 bits by 16 bits and store the top 32 bits of the result. It
* makes 32-bit fixed-point arithmetic possible without overflow. That is
* why jpeg_idct_ifast(), which is written in C, cannot be improved.
*
* More tricks are used to gain more speed. First of all, we use as many
* registers as possible. ARM processor has 16 registers including sp (r13)
* and pc (r15), so only 14 registers can be used without limitations. In
* general, we let r0 to r7 hold the coefficients; r10 and r11 hold four
* 16-bit constants; r12 and r14 hold two of the four arguments; and r8 hold
* intermediate value. In the second pass, r9 is the loop counter. In the
* first pass, r8 to r11 are used to hold quantization values, so the loop
* counter is held by sp. Yes, the stack pointer. Since it must be aligned
* to 4-byte boundary all the time, we align it to 32-byte boundary and use
* bit 3 to bit 5. As the result, we actually use 14.1 registers. :-)
*
* Second, we rearrange quantization values to access them sequentially. The
* table is first transposed, and the new columns are placed in the order of
* 7, 5, 1, 3, 0, 2, 4, 6. Thus we can use LDMDB to load four values at a
* time. Rearranging coefficients also helps, but that requires to change a
* dozen of files, which seems not worth it. In addition, we choose to scale
* up quantization values by 13 bits, so the coefficients are scaled up by
* 16 bits after both passes. Then we can pack and saturate them two at a
* time using PKHTB and USAT16 instructions.
*
* Third, we reorder the instructions to avoid bubbles in the pipeline. This
* is done by hand accroding to the cycle timings and the interlock behavior
* described in the technical reference manual of ARM1136JF-S. We also take
* advantage of dual issue processors by interleaving instructions with
* dependencies. It has been benchmarked on four devices and all the results
* showed distinguishable improvements. Note that PLD instructions actually
* slow things down, so they are removed at the last minute. In the future,
* this might be futher improved using a system profiler.
*/
#ifdef __arm__
#include <machine/cpu-features.h>
#endif
#if __ARM_ARCH__ >= 6
// void armv6_idct(short *coefs, int *quans, unsigned char *rows, int col)
.arm
.text
.align
.global armv6_idct
.func armv6_idct
armv6_idct:
// Push everything except sp (r13) and pc (r15).
stmdb sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r14}
// r12 = quans, r14 = coefs.
sub r4, sp, #236
bic sp, r4, #31
add r5, sp, #224
add r12, r1, #256
stm r5, {r2, r3, r4}
add r14, r0, #16
pass1_head:
// Load quantization values. (q[0, 2, 4, 6])
ldmdb r12!, {r8, r9, r10, r11}
// Load coefficients. (c[4, 1, 2, 3, 0, 5, 6, 7])
ldrsh r4, [r14, #-2] !
ldrsh r1, [r14, #16]
ldrsh r2, [r14, #32]
ldrsh r3, [r14, #48]
ldrsh r0, [r14, #64]
ldrsh r5, [r14, #80]
ldrsh r6, [r14, #96]
ldrsh r7, [r14, #112]
// r4 = q[0] * c[0];
mul r4, r8, r4
// Check if ACs are all zero.
cmp r0, #0
orreqs r8, r1, r2
orreqs r8, r3, r5
orreqs r8, r6, r7
beq pass1_zero
// Step 1: Dequantizations.
// r2 = q[2] * c[2];
// r0 = q[4] * c[4] + r4;
// r6 = q[6] * c[6] + r2;
mul r2, r9, r2
mla r0, r10, r0, r4
mla r6, r11, r6, r2
// Load quantization values. (q[7, 5, 1, 3])
ldmdb r12!, {r8, r9, r10, r11}
// r4 = r4 * 2 - r0 = -(r0 - r4 * 2);
// r2 = r2 * 2 - r6 = -(r6 - r2 * 2);
rsb r4, r0, r4, lsl #1
rsb r2, r6, r2, lsl #1
// r7 = q[7] * c[7];
// r5 = q[5] * c[5];
// r1 = q[1] * c[1] + r7;
// r3 = q[3] * c[3] + r5;
mul r7, r8, r7
mul r5, r9, r5
mla r1, r10, r1, r7
mla r3, r11, r3, r5
// Load constants.
ldrd r10, constants
// Step 2: Rotations and Butterflies.
// r7 = r1 - r7 * 2;
// r1 = r1 - r3;
// r5 = r5 * 2 - r3 = -(r3 - r5 * 2);
// r3 = r1 + r3 * 2;
// r8 = r5 + r7;
sub r7, r1, r7, lsl #1
sub r1, r1, r3
rsb r5, r3, r5, lsl #1
add r3, r1, r3, lsl #1
add r8, r5, r7
// r2 = r2 * 1.41421 = r2 * 27146 / 65536 + r2;
// r8 = r8 * 1.84776 / 8 = r8 * 15137 / 65536;
// r1 = r1 * 1.41421 = r1 * 27146 / 65536 + r1;
smlawt r2, r2, r10, r2
smulwb r8, r8, r10
smlawt r1, r1, r10, r1
// r0 = r0 + r6;
// r2 = r2 - r6;
// r6 = r0 - r6 * 2;
add r0, r0, r6
sub r2, r2, r6
sub r6, r0, r6, lsl #1
// r5 = r5 * -2.61313 / 8 + r8 = r5 * -21407 / 65536 + r8;
// r8 = r7 * -1.08239 / 8 + r8 = r7 * -8867 / 65536 + r8;
smlawt r5, r5, r11, r8
smlawb r8, r7, r11, r8
// r4 = r4 + r2;
// r0 = r0 + r3;
// r2 = r4 - r2 * 2;
add r4, r4, r2
add r0, r0, r3
sub r2, r4, r2, lsl #1
// r7 = r5 * 8 - r3 = -(r3 - r5 * 8);
// r3 = r0 - r3 * 2;
// r1 = r1 - r7;
// r4 = r4 + r7;
// r5 = r8 * 8 - r1 = -(r1 - r8 * 8);
// r7 = r4 - r7 * 2;
rsb r7, r3, r5, lsl #3
sub r3, r0, r3, lsl #1
sub r1, r1, r7
add r4, r4, r7
rsb r5, r1, r8, lsl #3
sub r7, r4, r7, lsl #1
// r2 = r2 + r1;
// r6 = r6 + r5;
// r1 = r2 - r1 * 2;
// r5 = r6 - r5 * 2;
add r2, r2, r1
add r6, r6, r5
sub r1, r2, r1, lsl #1
sub r5, r6, r5, lsl #1
// Step 3: Reorder and Save.
str r0, [sp, #-4] !
str r4, [sp, #32]
str r2, [sp, #64]
str r6, [sp, #96]
str r5, [sp, #128]
str r1, [sp, #160]
str r7, [sp, #192]
str r3, [sp, #224]
b pass1_tail
// Precomputed 16-bit constants: 27146, 15137, -21407, -8867.
// Put them in the middle since LDRD only accepts offsets from -255 to 255.
.align 3
constants:
.word 0x6a0a3b21
.word 0xac61dd5d
pass1_zero:
str r4, [sp, #-4] !
str r4, [sp, #32]
str r4, [sp, #64]
str r4, [sp, #96]
str r4, [sp, #128]
str r4, [sp, #160]
str r4, [sp, #192]
str r4, [sp, #224]
sub r12, r12, #16
pass1_tail:
ands r9, sp, #31
bne pass1_head
// r12 = rows, r14 = col.
ldr r12, [sp, #256]
ldr r14, [sp, #260]
// Load constants.
ldrd r10, constants
pass2_head:
// Load coefficients. (c[0, 1, 2, 3, 4, 5, 6, 7])
ldmia sp!, {r0, r1, r2, r3, r4, r5, r6, r7}
// r0 = r0 + 0x00808000;
add r0, r0, #0x00800000
add r0, r0, #0x00008000
// Step 1: Analog to the first pass.
// r0 = r0 + r4;
// r6 = r6 + r2;
add r0, r0, r4
add r6, r6, r2
// r4 = r0 - r4 * 2;
// r2 = r2 * 2 - r6 = -(r6 - r2 * 2);
sub r4, r0, r4, lsl #1
rsb r2, r6, r2, lsl #1
// r1 = r1 + r7;
// r3 = r3 + r5;
add r1, r1, r7
add r3, r3, r5
// Step 2: Rotations and Butterflies.
// r7 = r1 - r7 * 2;
// r1 = r1 - r3;
// r5 = r5 * 2 - r3 = -(r3 - r5 * 2);
// r3 = r1 + r3 * 2;
// r8 = r5 + r7;
sub r7, r1, r7, lsl #1
sub r1, r1, r3
rsb r5, r3, r5, lsl #1
add r3, r1, r3, lsl #1
add r8, r5, r7
// r2 = r2 * 1.41421 = r2 * 27146 / 65536 + r2;
// r8 = r8 * 1.84776 / 8 = r8 * 15137 / 65536;
// r1 = r1 * 1.41421 = r1 * 27146 / 65536 + r1;
smlawt r2, r2, r10, r2
smulwb r8, r8, r10
smlawt r1, r1, r10, r1
// r0 = r0 + r6;
// r2 = r2 - r6;
// r6 = r0 - r6 * 2;
add r0, r0, r6
sub r2, r2, r6
sub r6, r0, r6, lsl #1
// r5 = r5 * -2.61313 / 8 + r8 = r5 * -21407 / 65536 + r8;
// r8 = r7 * -1.08239 / 8 + r8 = r7 * -8867 / 65536 + r8;
smlawt r5, r5, r11, r8
smlawb r8, r7, r11, r8
// r4 = r4 + r2;
// r0 = r0 + r3;
// r2 = r4 - r2 * 2;
add r4, r4, r2
add r0, r0, r3
sub r2, r4, r2, lsl #1
// r7 = r5 * 8 - r3 = -(r3 - r5 * 8);
// r3 = r0 - r3 * 2;
// r1 = r1 - r7;
// r4 = r4 + r7;
// r5 = r8 * 8 - r1 = -(r1 - r8 * 8);
// r7 = r4 - r7 * 2;
rsb r7, r3, r5, lsl #3
sub r3, r0, r3, lsl #1
sub r1, r1, r7
add r4, r4, r7
rsb r5, r1, r8, lsl #3
sub r7, r4, r7, lsl #1
// r2 = r2 + r1;
// r6 = r6 + r5;
// r1 = r2 - r1 * 2;
// r5 = r6 - r5 * 2;
add r2, r2, r1
add r6, r6, r5
sub r1, r2, r1, lsl #1
sub r5, r6, r5, lsl #1
// Step 3: Reorder and Save.
// Load output pointer.
ldr r8, [r12], #4
// For little endian: r6, r2, r4, r0, r3, r7, r1, r5.
pkhtb r6, r6, r4, asr #16
pkhtb r2, r2, r0, asr #16
pkhtb r3, r3, r1, asr #16
pkhtb r7, r7, r5, asr #16
usat16 r6, #8, r6
usat16 r2, #8, r2
usat16 r3, #8, r3
usat16 r7, #8, r7
orr r0, r2, r6, lsl #8
orr r1, r7, r3, lsl #8
#ifdef __ARMEB__
// Reverse bytes for big endian.
rev r0, r0
rev r1, r1
#endif
// Use STR instead of STRD to support unaligned access.
str r0, [r8, r14] !
str r1, [r8, #4]
pass2_tail:
adds r9, r9, #0x10000000
bpl pass2_head
ldr sp, [sp, #8]
add sp, sp, #236
ldmia sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r14}
bx lr
.endfunc
#endif
|