; Copyright 1996 Acorn Computers Ltd
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;     http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.
;
; hand-coded DCT

; Based on the inverse DCT in the Independent JPEG Group's release 4 code - look
; in c.jrevdct4. This C source, and the assembler output from the C (from cc 4.50)
; are both extremely useful in understanding this code. In particular CC was used
; to turn the multiplies into sequences of adds.

; 24-Sep-93 - started
; 21-Oct-93 - expanded-out-odd-cases tried and abandoned (see below).
; 13-Jan-94 - Spitfire bug found, many thanks to Mark Taunton.

; A 1D DCT takes 8 integers, adds and subtracts them, gives you 8 other integers.
; A 2D DCT does this to all the rows, then to all the columns (ie 16 times).
; The blocks (of 64 integers) are initially quite sparse, but they soon fill up.
; If the inputs (of a 1D DCT) are all zero, the outputs are all zero.
; If only the first arg is nonzero, the outputs are all replicas of the first arg.

; In this code for the 1D DCT components 0,2,4,6 are combined, then 1,3,5,7, then
; the results folded together. I have tried expanding out the handling of negative
; components so that there are four special cases for only one negative arg nonzero,
; but it made the result slower! Probably through destroying cache locality in the
; whole system. I've seen code for MPEG, which has the same problem to solve, which
; does this far more - a win on machines with a large cache or no cache. Also,
; maybe MPEG typically has sparser DCTs to perform?

; This DCT works to 8 bits of precision, whereas IJG4 uses 13. I have compared
; the results side-by-side on a 32bpp screen, and I can't see any significant
; differences. JPEG has already thrown away (sorry, 'quantized') all the numbers
; a lot anyway.

; ------------------------------------------------------------------------
; The DCT macros
; ------------------------------------------------------------------------
; The basic 1-D DCT is coded as macros, because the code is replicated
; for rows and for columns.
;
; There are various basic paths through a 1D DCT in this code:
;   all the values are zero - nothing need be done. (common, esp in 1st pass)
;   only r0 is non-zero - replicate it (also common, ditto)
;   r1/r3/r5/r7 are zero, others are non-zero
;   r0/r2/r4/r6 are zero, others are non-zero
;
; Register contract shared by all three macros:
;   r0-r7         the eight inputs on entry, the eight outputs on exit
;   r10,r11,r12   scratch
;   lr            scratch (the caller must have saved its return address)
;   r8,r9,sp      not touched
;   flags         corrupted
;
; Every constant multiply is done as a shift-and-add sequence. The comment at
; the head of each sequence names the IJG constant; the comments on the
; instructions give the running multiplier, which ends up as the constant
; times 256 (8 bits of fraction), rounded to an integer.
;
; Label contract ($rc is the prefix passed to the macros):
;   $rc._odd           defined by DCTransform
;   $rc._store         defined by DCTransform2 (its last line)
;   $rc._zr26          defined by DCTransform_Leftovers
;   $rc._odd_shortcut  defined by DCTransform_Leftovers
;   $rc._inc           must be defined by the caller if $nullsave is {TRUE}
;
; DCTransform must be followed immediately by DCTransform2; the split exists
; only to keep each macro under the assembler's size limit.

        MACRO
        DCTransform $rc

        ; ---- even components (r0, r2, r4, r6) ----
        ORRS    r10,r2,r6
        BEQ     $rc._zr26               ; r2 and r6 both zero: cheaper path

        ADD     r10,r2,r6               ; r10 = r2 + r6

        ;MulCon r11,r10,FIX_0_541196100           ; z1
        RSB     r11,r10,r10,LSL #4      ; *15
        ADD     r11,r11,r11,LSL #3      ; *135
        ADD     r11,r11,r10,LSL #2      ; *139          r11 = z1

        ;MulCon r10,r6,- FIX_1_847759065
        SUB     r10,r6,r6,LSL #5        ; *-31
        RSB     r10,r10,r10,LSL #4      ; *-465
        SUB     r10,r10,r6,LSL #3       ; *-473
        ADD     r10,r10,r11             ; r10 = tmp2 = z1 - 473*r6

        ;MulCon r12,r2,FIX_0_765366865
        ADD     r12,r2,r2,LSL #4        ; *17
        ADD     r12,r12,r2,LSL #5       ; *49
        ADD     r12,r11,r12,LSL #2      ; r12 = tmp3 = z1 + 196*r2
        ; z1 (r11) is dead now, as are the original r2 and r6

        SUB     r11,r0,r4               ; tmp1, not yet scaled
        ADD     r6,r10,r11,LSL #8       ; tmp11 = (tmp1<<8) + tmp2
        RSB     r2,r10,r11,LSL #8       ; tmp12 = (tmp1<<8) - tmp2

        ADD     r11,r0,r4               ; tmp0, not yet scaled
        ; the original r0 and r4 are dead now
        ADD     r0,r12,r11,LSL #8       ; tmp10 = (tmp0<<8) + tmp3
        RSB     r4,r12,r11,LSL #8       ; tmp13 = (tmp0<<8) - tmp3
        ; r10, r11, r12 are all scratch again
        ; r0/r2/r4/r6 now hold tmp10/tmp12/tmp13/tmp11

        ; ---- odd components (r1, r3, r5, r7) ----
        ORRS    r10,r1,r3
        ORREQS  r10,r5,r7
        BEQ     $rc._odd_shortcut       ; all four zero: nothing to fold in
$rc._odd
        ; Register pressure is severe: finish z3, z4 and z5 first so that
        ; z5 can be thrown away before z1 and z2 are started.
        ADD     r10,r7,r3               ; z3
        ADD     r11,r5,r1               ; z4
        ADD     r12,r10,r11             ; z3 + z4

        ;MulCon lr,r12,FIX_1_175875602            ; z5
        ADD     lr,r12,r12,LSL #5       ; *33
        ADD     lr,lr,lr,LSL #3         ; *297
        ADD     lr,lr,r12,LSL #2        ; *301          lr = z5
        ; r12 is scratch again

        ;MulCon r12,r10,- FIX_1_961570560         ; z3
        SUB     r12,r10,r10,LSL #1      ; *-1
        RSB     r12,r12,r12,LSL #8      ; *-255
        ADD     r12,r12,r10,LSL #2      ; *-251  (the final <<1 is still owed)
        ; r10 is scratch again

        ;MulCon r10,r11,- FIX_0_390180644         ; z4
        SUB     r10,r11,r11,LSL #1      ; *-1
        ADD     r10,r10,r10,LSL #5      ; *-33
        ADD     r10,r10,r11,LSL #3      ; *-25   (the final <<2 is still owed)
        ; r11 is scratch again

        ADD     r12,lr,r12,LSL #1       ; z3 += z5   (pays the <<1 owed above)
        ADD     r10,lr,r10,LSL #2       ; z4 += z5   (pays the <<2 owed above)
        ; z5 (lr) is dead now

        ; Now take z1 all the way through before starting on z2.
        ; Live here: r12 = z3, r10 = z4; r11 and lr are scratch.
        ADD     r11,r7,r1               ; z1

        ; The macro ends here only to avoid 'macro too big' errors;
        ; DCTransform2 carries straight on.
        MEND

        MACRO
        DCTransform2 $rc

        ; On entry: r10 = z4, r11 = z1 (unscaled), r12 = z3, lr scratch.

        ;MulCon lr,r7,FIX_0_298631336             ; tmp0
        ADD     lr,r7,r7,LSL #1         ; *3
        ADD     lr,lr,r7,LSL #4         ; *19    (the final <<2 is still owed)
        ; the original r7 is dead now

        ;MulCon r7,r1,FIX_1_501321110             ; tmp3
        ADD     r7,r1,r1,LSL #1         ; *3     (the final <<7 is still owed)
        ; the original r1 is dead now

        ;MulCon r1,r11,- FIX_0_899976223          ; z1
        ADD     r1,r11,r11,LSL #2       ; *5
        ADD     r1,r1,r11,LSL #3        ; *13
        SUB     r1,r1,r11,LSL #7        ; *-115  (the final <<1 is still owed)
        ; r11 is scratch again

        ADD     lr,r12,lr,LSL #2        ; tmp0 += z3  (pays the <<2 owed on tmp0)
        ADD     lr,lr,r1,LSL #1         ; tmp0 += z1  (pays the <<1 owed on z1)
        ADD     r7,r10,r7,LSL #7        ; tmp3 += z4  (pays the <<7 owed on tmp3)
        ADD     r7,r7,r1,LSL #1         ; tmp3 += z1  (pays the <<1 owed on z1)
        ; r1 is scratch again

        ; Now z2.
        ; Live here: r7 = tmp3, r10 = z4, r12 = z3, lr = tmp0;
        ; r1 and r11 are scratch.
        ADD     r1,r5,r3                ; z2

        ;MulCon r11,r1,- FIX_2_562915447          ; z2
        SUB     r11,r1,r1,LSL #1        ; *-1
        ADD     r11,r11,r11,LSL #5      ; *-33
        SUB     r11,r11,r1,LSL #3       ; *-41   (the final <<4 is still owed)
        ; r1 is scratch again

        ADD     r10,r10,r11,LSL #4      ; r10 = z2 + z4  (pays the <<4 owed on z2)
        ADD     r12,r12,r11,LSL #4      ; r12 = z2 + z3  (pays the <<4 owed on z2)
        ; r11 is scratch again

        ;MulCon r11,r5,FIX_2_053119869            ; tmp1
        RSB     r11,r5,r5,LSL #3        ; *7
        ADD     r11,r11,r5,LSL #8       ; *263   (the final <<1 is still owed)
        ; the original r5 is dead now

        ;MulCon r5,r3,FIX_3_072711026             ; tmp2
        ADD     r5,r3,r3,LSL #8         ; *257
        ADD     r5,r5,r5,LSL #1         ; *771
        ADD     r5,r5,r3,LSL #4         ; *787
        ; the original r3 is dead now

        ADD     r11,r10,r11,LSL #1      ; tmp1 += z2 + z4  (pays the <<1 owed on tmp1)
        ADD     r10,r5,r12              ; tmp2 += z2 + z3

        ; ---- final combination and output stage ----
        ;   r0  = tmp10          r6  = tmp11
        ;   r1  = (scratch)      r7  = tmp3
        ;   r2  = tmp12          r8  = caller's (untouched)
        ;   r3  = (scratch)      r9  = caller's (untouched)
        ;   r4  = tmp13          r10 = tmp2
        ;   r5  = (scratch)      r11 = tmp1
        ;   r12 = (scratch)      lr  = tmp0
        ;
        ; Each pair below reads and writes only its own registers, so the
        ; four pairs are independent of one another.
        ADD     r1,r6,r10               ; out1 = tmp11 + tmp2
        SUB     r6,r6,r10               ; out6 = tmp11 - tmp2

        SUB     r5,r2,r11               ; out5 = tmp12 - tmp1
        ADD     r2,r2,r11               ; out2 = tmp12 + tmp1

        ADD     r3,r4,lr                ; out3 = tmp13 + tmp0
        SUB     r4,r4,lr                ; out4 = tmp13 - tmp0

        ADD     r0,r0,r7                ; out0 = tmp10 + tmp3
        SUB     r7,r0,r7,LSL #1         ; out7 = tmp10 - tmp3, formed as
                                        ;        (tmp10 + tmp3) - 2*tmp3
$rc._store
        MEND

; DCTransform_Leftovers holds the out-of-line special cases that DCTransform
; branches to. It finishes with r0-r7 holding the eight outputs, exactly as
; if control had arrived at $rc._store; the code that follows the macro call
; must either replicate the end-of-loop code or branch to it.
;
; The 'nullsave' argument tells the macro if the answer goes back in the same
; place as the input came from - TRUE for rows, FALSE for columns. If TRUE and
; all inputs are zero, the outputs need not even be saved, and the macro
; branches to $rc._inc instead of falling out of its end.

        MACRO
        DCTransform_Leftovers $rc,$nullsave

$rc._zr26
        ; r2 and r6 are zero, so tmp2 and tmp3 are zero and the even half
        ; reduces to tmp0 = (r0+r4)<<8 and tmp1 = (r0-r4)<<8.
        ; This is done in preference to a pure replicate-the-DC-term case:
        ; it is hardly any slower, given that only one instruction of test
        ; was spent getting here, and it handles one more case.
        SUB     r6,r0,r4
        MOV     r6,r6,LSL #8            ; tmp11 = tmp1 + tmp2  (tmp2 = 0)
        MOV     r2,r6                   ; tmp12 = tmp1 - tmp2
        ADD     r0,r0,r4
        MOV     r0,r0,LSL #8            ; tmp10 = tmp0 + tmp3  (tmp3 = 0)
        MOV     r4,r0                   ; tmp13 = tmp0 - tmp3

        ; If any odd input is non-zero, rejoin the general odd-half code.
        ORRS    r10,r1,r3
        ORREQS  r10,r5,r7
        BNE     $rc._odd

        [ $nullsave.
        ; r1-r3 and r5-r7 are known to be zero here. If r0 and r4 were zero
        ; too AND the answer is being written back over the input, there is
        ; nothing to store. If the answer goes somewhere different it must
        ; still be stored - that was the 'spitfire' bug.
        ORRS    r10,r0,r2               ; (r0+r4)<<8 and (r0-r4)<<8 both zero
        BEQ     $rc._inc                ; -> everything was zero, skip the store
        ]

$rc._odd_shortcut
        ; Odd inputs all zero: each output pair is just the even-half value.
        MOV     r1,r6                   ; out1 = tmp11   (r6 is already out6)
        MOV     r3,r4                   ; out3 = tmp13   (r4 is already out4)
        MOV     r5,r2                   ; out5 = tmp12   (r2 is already out2)
        MOV     r7,r0                   ; out7 = tmp10   (r0 is already out0)
        ; It is as if control had reached $rc._store here.
        MEND

;; ------------------------------------------------------------------------
;; Test proc - procedure to do a 1-D DCT (kept, commented out, as a usage
;; example for the macros)
;; ------------------------------------------------------------------------
;; extern void dct_1d(decompress_info_ptr cinfo, int *data);
;asm_dct_1_d
;        STMDB   sp!,{r0-r12,lr}           ; save state
;        MOV     r9,r1                     ; data pointer
;        LDMIA   r9,{r0-r7}                ; get this row
;        DCTransform dct1d
;        DCTransform2 dct1d
;        STMIA   r9!,{r0-r7}               ; put this row
;dct1d_inc
;        LDMIA   sp!,{r0-r12,pc}           ; exit
;        DCTransform_Leftovers dct1d,{TRUE}
;        STMIA   r9!,{r0-r7}               ; put this row
;        LDMIA   sp!,{r0-r12,pc}           ; exit

; ------------------------------------------------------------------------
; The main routine
; ------------------------------------------------------------------------
; extern void asm_j_rev_dct(decompress_info_ptr cinfo, DCTBLOCK data, int count);
;
; In:   r0 = cinfo (unused)
;       r1 = pointer to the first block of 64 words
;       r2 = number of consecutive blocks; nothing is done if it is <= 0
; Out:  r4-r12 preserved; r0-r3 and flags are not.
;
; Each block gets a row pass, done in place, followed by a column pass.
; NOTE: the column pass does NOT write back in place. The eight results for
; column c are stored as the c'th group of eight consecutive words in the
; 64 words immediately BELOW the block being processed. For the second and
; later blocks that region is the previous (already processed) input block.
; The words below the first block are written too.
;
; No descaling is performed in this routine.

asm_j_rev_dct
        CMP     r2,#0                   ; count <= 0: nothing to do
        MOVLE   pc,lr
        STMDB   sp!,{r1,r2,r4-r12,lr}   ; block pointer at [sp], count at [sp,#4];
                                        ; r3 need not be preserved

dct_loop                                ; ---- once per block; r1 = block ----
        MOV     r9,r1                   ; r9 = row pointer
        MOV     r8,#8                   ; r8 = rows left to do

        ; ---- row pass: transform each row in place ----
dctrow_loop
        LDMIA   r9,{r0-r7}              ; fetch the row
        DCTransform dctrow
        DCTransform2 dctrow             ; (ends at label dctrow_store)
        STMIA   r9,{r0-r7}              ; write it back
dctrow_inc                              ; all-zero rows join here, unstored
        ADD     r9,r9,#8*4              ; step on to the next row
        SUBS    r8,r8,#1
        BNE     dctrow_loop
dctrow_done

        ; ---- column pass: much the same, but the loads are strided ----
        ; r9 is one whole block past the start here. (The block start could
        ; equally be reloaded with LDR r9,[sp].)
        SUB     r9,r9,#8*8*4            ; r9 = input pointer  = start of block
        SUB     r8,r9,#8*8*4            ; r8 = output pointer = one block below
dctcol_loop
        ; One column: eight values spaced 8 words apart.
        LDR     r0,[r9,#8*4*0]
        LDR     r1,[r9,#8*4*1]
        LDR     r2,[r9,#8*4*2]
        LDR     r3,[r9,#8*4*3]
        LDR     r4,[r9,#8*4*4]
        LDR     r5,[r9,#8*4*5]
        LDR     r6,[r9,#8*4*6]
        LDR     r7,[r9,#8*4*7]
        DCTransform dctcol
        DCTransform2 dctcol             ; (ends at label dctcol_store)
        STMIA   r8,{r0-r7}              ; store as eight consecutive words, in
                                        ; the block below the original input
dctcol_inc
        ADD     r8,r8,#8*4              ; output pointer: on by 8 words
        ADD     r9,r9,#4                ; input pointer: on by 1 word
        ; r9 - r8 starts at 8*8*4 and shrinks by 7*4 per column, so it has
        ; fallen to exactly 8*4 once the eighth column has been done.
        SUB     r10,r9,r8
        CMP     r10,#8*4
        BGT     dctcol_loop

        ; ---- block finished ----
        LDMIA   sp!,{r1,r2}             ; recover block pointer and count
        SUBS    r2,r2,#1
        LDMLEIA sp!,{r4-r12,pc}         ; none left: restore registers and return

        ADD     r1,r1,#64*4             ; more to do: step on to the next block,
        STMDB   sp!,{r1,r2}             ; put pointer and count back on the stack
        B       dct_loop

; --------------------------------------------------------------------------
; Out-of-line special cases for the row pass. The end-of-loop code is
; replicated here rather than branched to.

        DCTransform_Leftovers dctrow,{TRUE}
        STMIA   r9!,{r0-r7}             ; write the row back and step on
        SUBS    r8,r8,#1
        BNE     dctrow_loop             ; more rows to do
        B       dctrow_done             ; that was the last row

; Out-of-line special cases for the column pass. The result must always be
; stored, because it goes somewhere other than where the input came from.

        DCTransform_Leftovers dctcol,{FALSE}
        B       dctcol_store            ; rejoin the column loop at its store

        END