; Copyright 1996 Acorn Computers Ltd
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;     http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.
;

; JPEG Colour conversion facilities
; started 24-Sep-93 WRS
; called from c.rojpeg - C equivalents of these functions can also be found there.

; ******************************************************************************
; *                                                                            *
; *  Monochrome colour conversion                                              *
; *                                                                            *
; ******************************************************************************

; Do descaling etc. for a mono pixel, producing a 32-bit output word.
; On entry: $r  = sample value scaled up by 19 bits
;           r12 = 128 (level shift; must be set up by the user of the macro)
; On exit:  $r  = grey level clamped to 0..255, replicated into the three
;                 low bytes of the word (R, G and B all equal).
        MACRO
        MonoConv $r
        ADD     $r,$r,#1:SHL:18                 ; add half of 1<<19 so the shift below rounds
        ADDS    $r,r12,$r,ASR #19               ; add 128, cut off 19 bits
        MOVLT   $r,#0                           ; if result < 0, cut off at 0
        CMP     $r,#255
        MOVGT   $r,#255                         ; if result > 255, cut off at 255.
        ORR     $r,$r,$r,LSL #8                 ; replicate grey value in each of R, G, B
        ORR     $r,$r,$r,LSL #8
        MEND

; Do descaling etc. for a mono pixel, producing a single 8-bit grey level.
; On entry: $r  = sample value scaled up by 19 bits
;           r12 = 128 (level shift; must be set up by the user of the macro)
; On exit:  $r  = grey level clamped to 0..255 (no replication).
        MACRO
        MonoConv8 $r
        ADD     $r,$r,#1:SHL:18                 ; add half of 1<<19 so the shift below rounds
        ADDS    $r,r12,$r,ASR #19               ; add 128, cut off 19 bits
        MOVLT   $r,#0                           ; if result < 0, cut off at 0
        CMP     $r,#255
        MOVGT   $r,#255                         ; if result > 255, cut off at 255.
        MEND

; extern void asm_mono_convert_block(JBLOCK jblock, int *outptr, int outoffset)
; /* Convert greyscale image into 32bit RGB values. */
; r0 = jblock - in row order, values that need descaling
; r1 = outptr - in column order, put RGB values here
; r2 = outoffset - distance (in words) between output rows.
; r11 = col counter.
; We LDR from the block and STM to the output rather than vice versa,
; because a long sequence of STRs clogs up the write buffer and is slow.
; Each pass of the loop reads eight input words spaced 8 words apart,
; converts them with MonoConv and stores them as eight consecutive output words.
asm_mono_convert_block
        STMDB   sp!,{r0-r12,lr}                 ; save state
        MOV     r11,#8                          ; col counter
        MOV     r12,#128                        ; required constant (used by MonoConv)
jc_mono_loop
        LDR     r3,[r0,#8*4*0]                  ; get a whole row
        LDR     r4,[r0,#8*4*1]                  ; interleave order hopes to help ARM8!
        MonoConv r3
        LDR     r5,[r0,#8*4*2]
        MonoConv r4
        LDR     r6,[r0,#8*4*3]
        MonoConv r5
        LDR     r7,[r0,#8*4*4]
        MonoConv r6
        LDR     r8,[r0,#8*4*5]
        MonoConv r7
        LDR     r9,[r0,#8*4*6]
        MonoConv r8
        LDR     r10,[r0,#8*4*7]
        MonoConv r9
        MonoConv r10
        STMIA   r1,{r3-r10}                     ; store a row
        ADD     r1,r1,r2,LSL #2                 ; add row offset to output pointer
        ADD     r0,r0,#4                        ; advance input pointer
        SUBS    r11,r11,#1                      ; outer loop
        BNE     jc_mono_loop
        LDMIA   sp!,{r0-r12,pc}                 ; return

; extern void asm_mono_convert_block_8(JBLOCK jblock, int *outptr, int outoffset)
; /* Convert greyscale image into 8bit Grey values. */
; r0 = jblock - in row order, values that need descaling
; r1 = outptr - in column order, put grey values here
; r2 = outoffset - distance (in words) between output rows.
; r11 = col counter.
; We LDR from the block and STM to the output rather than vice versa,
; because a long sequence of STRs clogs up the write buffer and is slow.
; Eight grey bytes are packed into two words (r3 and r7, lowest byte first)
; and stored with a single STM per output row.
asm_mono_convert_block_8
        STMDB   sp!,{r0-r12,lr}                 ; save state
        MOV     r11,#8                          ; col counter
        MOV     r12,#128                        ; required constant (used by MonoConv8)
jc_mono_loop8
        LDR     r3,[r0,#8*4*0]                  ; get a whole row
        LDR     r4,[r0,#8*4*1]                  ; interleave order hopes to help ARM8!
        MonoConv8 r3
        LDR     r5,[r0,#8*4*2]
        MonoConv8 r4
        ORR     r3,r3,r4,LSL #8                 ; pack pixels 0..3 into r3
        LDR     r6,[r0,#8*4*3]
        MonoConv8 r5
        ORR     r3,r3,r5,LSL #16
        LDR     r7,[r0,#8*4*4]
        MonoConv8 r6
        ORR     r3,r3,r6,LSL #24
        LDR     r8,[r0,#8*4*5]
        MonoConv8 r7
        LDR     r9,[r0,#8*4*6]
        MonoConv8 r8
        ORR     r7,r7,r8,LSL #8                 ; pack pixels 4..7 into r7
        LDR     r10,[r0,#8*4*7]
        MonoConv8 r9
        ORR     r7,r7,r9,LSL #16
        MonoConv8 r10
        ORR     r7,r7,r10,LSL #24
        STMIA   r1,{r3,r7}                      ; store a row
        ADD     r1,r1,r2,LSL #2                 ; add row offset to output pointer
        ADD     r0,r0,#4                        ; advance input pointer
        SUBS    r11,r11,#1                      ; outer loop
        BNE     jc_mono_loop8
        LDMIA   sp!,{r0-r12,pc}                 ; return

; ******************************************************************************
; *                                                                            *
; *  YUV->RGB colour conversion.                                               *
; *                                                                            *
; ******************************************************************************

; Given four 8*8 Y blocks and one block each of U and V, create 16*16 output
; RGB pixels.

; All values in DCT blocks are scaled up by SCALEBITS bits.
SCALEBITS       *       19
ONE_HALF        *       &40000                  ; 1:SHL:(SCALEBITS-1)

; combine the y value of r/g/b with the value derived from u and v.
; then normalise the result to be a value in 0..255.
; $y holds the y value, $gunsrc the value from u and v, result in $gun.
; $op is ADD or SUB, selecting how $gunsrc is combined with $y.
        MACRO
        NormaliseGun $op,$gun,$y,$gunsrc
        $op     $gun,$y,$gunsrc                 ; R/G/B, at 19 bits
        MOVS    $gun,$gun,ASR #SCALEBITS        ; truncate
        ; MOVS with a shifted operand leaves V untouched, so test N alone (MI)
        ; rather than N<>V (LT), which would depend on a stale V flag.
        MOVMI   $gun,#0
        CMP     $gun,#255
        MOVGT   $gun,#255
        MEND

; static void colour_convert_block(JCOEF *yuv, int *outptr, int outoffset)
; /* yuv[0..3] are Y, yuv[4] is U, yuv[5] is V. Output 16*16 colour block */
; r0 -> the six blocks, YYYYUV (all row-ordered, ie 'wrong' way round, different from output)
; r1 -> output buffer
; r2 = offset in words between rows of output
; the output goes in outptr[row*outoffset + col] for row/col in 0..15
; Each output word is built as R + (G << 8) + (B << 16).
asm_colour_convert_block
        STMDB   sp!,{r0-r12,lr}                 ; save state
;        Debug   gs,"in colour_convert_block"
        ADD     r3,r1,#16*4                     ; column limit pointer (inner loop) - outptr+16 words
        ADD     r4,r1,r2,LSL #3+2               ; row limit pointer (outer loop) - outptr+8*outoffset words
        ADD     r5,r0,#4*64*4                   ; pointer into U block. V values 64 words on from this
        ADD     r6,r4,r2,LSL #3+2               ; real row limit pointer - outptr+16*outoffset words

; The main loop goes round once for each 2*2 square of four output pixels, using
; four Y values, one U value, one V value.
jc_colour_loop                                  ; each two rows and each two columns of output
; do four output pixels, using:
;   [r5] is U value
;   [r5,#64*4] is V value
;   [r0] is Y value for output word [r1]
;   [r0,#4] is Y value for output word [r1,r2,LSL #2]
;   [r0,#8*4] is Y value for output word [r1,#4]
;   [r0,#8*4+4] is Y value for output word [r1,#4 + r2,LSL #2] (so to speak!)

; first we compute the values derived from U and V, which are
; true for all four pixels.
        LDR     r7,[r5]                         ; U value
        MOV     r7,r7,ASR #8                    ; the multiplies will get us back to SCALEBITS again
        LDR     r8,[r5, #64*4]                  ; V value
        MOV     r8,r8,ASR #8                    ; the multiplies will get us back to SCALEBITS again
; Should add 1:SHL:7 before ASR #8, but not regarded as significant enough
; considering how much bigger SCALEBITS is.

; Multiply sequences generated by cc 4.50, for 8 bits of accuracy.
; these sequences lead to a result shifted left by 8 bits.
        ;MulCon r9,r7,FIX(1.77200)              ; B, without the Y yet
        ; 1.772*256 = 453.6, so we want 454*U = (227*U) LSL #1.
        ; (The third step used to be SUB r9,r9,r7,LSL #3, which gave 215*U,
        ; i.e. an effective factor of 430/256 = 1.680 rather than 1.772.)
        ADD     r9,r7,r7,LSL #5                 ; 33*U
        RSB     r9,r9,r7,LSL #8                 ; 256*U - 33*U = 223*U
        ADD     r9,r9,r7,LSL #2                 ; 227*U - still needs a LSL #1
        ;MulCon r10,r8,FIX(1.40200)             ; R, without the Y yet
        ADD     r10,r8,r8,LSL #5                ; 33*V
        RSB     r10,r10,r10,LSL #3              ; 231*V
        ADD     r10,r10,r8,LSL #7               ; 359*V
        ;MulCon r11,r7,-FIX(0.34414)
        ADD     r11,r7,r7,LSL #1                ; 3*U
        ADD     r11,r11,r7,LSL #3               ; 11*U - LSL #3 still needed - see below
        ;MulCon r12,r8,-FIX(0.71414)
        RSB     r12,r8,r8,LSL #6                ; 63*V
        ADD     r12,r12,r8,LSL #7               ; 191*V
        SUB     r12,r12,r8,LSL #3               ; 183*V
; After those multiplies, the values are shifted up by SCALEBITS again.
; scratch r7,r8
        ADD     r11,r12,r11,LSL #3              ; -G, without the Y yet - did the LSL #3, see above.
; scratch r12

; We're going to add each of r9/r10/r11 to the Y values.
; The Y values need 128 added to them - add it at this point.
; need to add a half for the truncation - do that at the same time.
; We'll be truncating at SCALEBITS.
        MOV     r7,#ONE_HALF                    ; construct constant - can't quite be done in one instruction.
        ADD     r7,r7,#128:SHL:SCALEBITS
        ADD     r9,r7,r9,LSL #1                 ; LSL #1 still owed to R9 - see above.
        ADD     r10,r10,r7
        SUB     r11,r11,r7                      ; r11 is to be subtracted from Y, not added.

; now process the four pixels one at a time.
        LDMIA   r0,{r7,r8}                      ; first two Y values, shifted up by SCALEBITS
        NormaliseGun ADD,r12,r7,r9              ; B
        NormaliseGun ADD,lr,r7,r10              ; R
        NormaliseGun SUB,r7,r7,r11              ; G
        ORR     r7,lr,r7,LSL #8                 ; G and R
        ORR     r7,r7,r12,LSL #16               ; complete output pixel
        STR     r7,[r1]                         ; output pixel
;        Debug   gs,"conp = ",r7

        NormaliseGun ADD,r12,r8,r9              ; B
        NormaliseGun ADD,lr,r8,r10              ; R
        NormaliseGun SUB,r8,r8,r11              ; G
        ORR     r8,lr,r8,LSL #8                 ; G and R
        ORR     r8,r8,r12,LSL #16               ; complete output pixel
        STR     r8,[r1,r2,LSL #2]               ; output pixel
        ADD     r1,r1,#4

        ADD     r7,r0,#8*4                      ; prepare to load next two pixels
        LDMIA   r7,{r7,r8}                      ; other two pixels

        NormaliseGun ADD,r12,r7,r9              ; B
        NormaliseGun ADD,lr,r7,r10              ; R
        NormaliseGun SUB,r7,r7,r11              ; G
        ORR     r7,lr,r7,LSL #8                 ; G and R
        ORR     r7,r7,r12,LSL #16               ; complete output pixel
        STR     r7,[r1]                         ; output pixel

        NormaliseGun ADD,r12,r8,r9              ; B
        NormaliseGun ADD,lr,r8,r10              ; R
        NormaliseGun SUB,r8,r8,r11              ; G
        ORR     r8,lr,r8,LSL #8                 ; G and R
        ORR     r8,r8,r12,LSL #16               ; complete output pixel
        STR     r8,[r1,r2,LSL #2]               ; output pixel
        ADD     r1,r1,#4

; increment pointers to go two pixels along an output row
; r1 (output pointer) already updated
        ADD     r5,r5,#8*4                      ; pointer into row-organised U,V values
        ADD     r0,r0,#2*8*4                    ; ditto for Y values

; check for end of row
        CMP     r1,r3                           ; output pointer reached end of output row?
        BNE     jc_colour_loop                  ; go round for next column
; Amazingly there is no special action to take half-way along each output row,
; when we switch from one Y block to the next - because of the ordering and
; arrangement of the Y blocks, it just acts as a single 8*16 block.

; It's the end of the row. Update all input and output pointers to
; advance to next one.
        ADD     r1,r1,r2,LSL #2+1               ; advance output ptr by two output rows
        SUB     r1,r1,#16*4                     ; ... and then to beginning of next output row.
        SUB     r5,r5,#64*4-4                   ; advance UV pointer to start of next row
        SUB     r0,r0,#2*64*4-8                 ; ditto Y pointer, but back two blocks
        ADD     r3,r1,#16*4                     ; reset column limit pointer (inner loop) - outptr+16 words

; Check for having to change to the second pair of Y blocks, or terminate
        CMP     r1,r4
        BNE     jc_colour_loop                  ; normal case - we set off on another two rows of output

; It's either the half-way point, in which case we need to change to the second pair
; of input Y blocks, or it's the end. Test r4 against the 'real' limit pointer.
        CMP     r4,r6                           ; is this the end?
        LDMEQIA sp!,{r0-r12,pc}                 ; if so, return - nothing more to do

; We've reached the half-way point.
        MOV     r4,r6                           ; next time we test r4 and r6, exit.
        ADD     r0,r0,#7*8*4+8*8*4              ; advance r0 from end of row 0 of block 0,
                                                ; to start of row 0 of block 2.
        B       jc_colour_loop                  ; and continue.

; Performance notes for colour conversion.
; approx number of ticks for 4 pixels:
;    7 loop etc.
;   23 common work on the UV values
;   40 loading/storing,combining gun values
;   52 normalising gun values
;   --
;  122/4 = 30.5 per pixel.
; Could use a register to hold &101, saves 1 per pixel.
; How common is <0 and >255? If rare,
;   test all at once (easy on <0, harder for >255)
;   and do a B if there's a problem.
;   Not QUITE worth it, overflow about 5% of the time.
; Lookup tables don't save much - 3 instructions becomes a LDR.
; Could replace whole 'normalise' by a single LDR? Not QUITE worth it.
; A lookup for the entire computation, indexed by Y and U and V, saves quite
; a lot. BUT, 5 bits of U and V is insufficient, in smoothly shaded pictures
; (after a brief experiment). 6 bits each makes a .5MB table, too big!

; ******************************************************************************
; *                                                                            *
; *  YUV->32bpp grey conversion.                                               *
; *                                                                            *
; ******************************************************************************

; Given four 8*8 Y blocks and one block each of U and V, create 16*16 output
; RGB pixels.

; ******************************************************************************
; *                                                                            *
; *  YUV->RGB colour conversion for 16bpp output                               *
; *                                                                            *
; ******************************************************************************

; Almost identical to the 32bpp case above, except that 16bit pixels are generated.
; An ordered dither is added to this - really only good for 1:1 plotting in 16bpp.
; Thus, a block copy can move it to the screen, and the whole thing is a great
; deal faster.

; combine the y value of r/g/b with the value derived from u and v.
; then normalise the result to be a value in 0..31.
; $y holds the y value, $gunsrc the value from u and v, result in $gun.
; $op is ADD or SUB, selecting how $gunsrc is combined with $y.
        MACRO
        NormaliseGun16 $op,$gun,$y,$gunsrc
        $op     $gun,$y,$gunsrc                 ; R/G/B, at 19 bits
        MOVS    $gun,$gun,ASR #SCALEBITS+3      ; truncate
        ; MOVS with a shifted operand leaves V untouched, so test N alone (MI)
        ; rather than N<>V (LT), which would depend on a stale V flag.
        MOVMI   $gun,#0
        CMP     $gun,#31
        MOVGT   $gun,#31
        MEND

; static void colour_convert_block_16(JCOEF *yuv, short int *outptr, int outoffset)
; /* yuv[0..3] are Y, yuv[4] is U, yuv[5] is V. Output 16*16 colour block of 16bit pixels */
; r0 -> the six blocks, YYYYUV (all row-ordered, ie 'wrong' way round, different from output)
; r1 -> output buffer
; r2 = offset in words between rows of output
; the output goes in outptr[row*outoffset + col] for row/col in 0..15
; Each 16-bit pixel is built as R + (G << 5) + (B << 10); two horizontally
; adjacent pixels are packed into one word before being stored.
asm_colour_convert_block_16
        STMDB   sp!,{r0-r12,lr}                 ; save state
        ADD     r3,r1,#16*2                     ; column limit pointer (inner loop) - outptr+16 pixels
        ADD     r4,r1,r2,LSL #3+2               ; row limit pointer (outer loop) - outptr+8*outoffset words
        ADD     r5,r0,#4*64*4                   ; pointer into U block. V values 64 words on from this
        ADD     r6,r4,r2,LSL #3+2               ; real row limit pointer - outptr+16*outoffset words
        STMIA   sp,{r4,r6}                      ; r4 and r6 used as temp workspace during the colour conversion:
                                                ; we never need to reload r0/r1, so use these stack locations.

; The main loop goes round once for each 2*2 square of four output pixels, using
; four Y values, one U value, one V value.
jc_colour_loop16                                ; each two rows and each two columns of output
; do four output pixels, using:
;   [r5] is U value
;   [r5,#64*4] is V value
;   [r0] and [r0,#8*4] are Y values for output word [r1]
;   [r0,#4] and [r0,#8*4+4] are Y values for output word [r1,r2,LSL #2]

; first we compute the values derived from U and V, which are
; true for all four pixels.
        LDR     r7,[r5]                         ; U value
        MOV     r7,r7,ASR #8                    ; the multiplies will get us back to SCALEBITS again
        LDR     r8,[r5, #64*4]                  ; V value
        MOV     r8,r8,ASR #8                    ; the multiplies will get us back to SCALEBITS again
; Should add 1:SHL:7 before ASR #8, but not regarded as significant enough
; considering how much bigger SCALEBITS is.

; Multiply sequences generated by cc 4.50, for 8 bits of accuracy.
; these sequences lead to a result shifted left by 8 bits.
        ;MulCon r9,r7,FIX(1.77200)              ; B, without the Y yet
        ; 1.772*256 = 453.6, so we want 454*U = (227*U) LSL #1.
        ; (The third step used to be SUB r9,r9,r7,LSL #3, which gave 215*U,
        ; i.e. an effective factor of 430/256 = 1.680 rather than 1.772.)
        ADD     r9,r7,r7,LSL #5                 ; 33*U
        RSB     r9,r9,r7,LSL #8                 ; 256*U - 33*U = 223*U
        ADD     r9,r9,r7,LSL #2                 ; 227*U - still needs a LSL #1
        ;MulCon r10,r8,FIX(1.40200)             ; R, without the Y yet
        ADD     r10,r8,r8,LSL #5                ; 33*V
        RSB     r10,r10,r10,LSL #3              ; 231*V
        ADD     r10,r10,r8,LSL #7               ; 359*V
        ;MulCon r11,r7,-FIX(0.34414)
        ADD     r11,r7,r7,LSL #1                ; 3*U
        ADD     r11,r11,r7,LSL #3               ; 11*U - LSL #3 still needed - see below
        ;MulCon r12,r8,-FIX(0.71414)
        RSB     r12,r8,r8,LSL #6                ; 63*V
        ADD     r12,r12,r8,LSL #7               ; 191*V
        SUB     r12,r12,r8,LSL #3               ; 183*V
; After those multiplies, the values are shifted up by SCALEBITS again.
; scratch r7,r8
        ADD     r11,r12,r11,LSL #3              ; -G, without the Y yet - did the LSL #3, see above.
; scratch r12

; We're going to add each of r9/r10/r11 to the Y values.
; The Y values need 128 added to them - add it at this point.
; need to add a half for the truncation - do that at the same time.
; We'll be truncating at SCALEBITS+3, so the half is ONE_HALF:SHL:3.
;        MOV     r7,#ONE_HALF                   ; construct constant - can't quite be done in one instruction.
;        ADD     r7,r7,#128:SHL:SCALEBITS
        MOV     r7,#(ONE_HALF:SHL:3)+(128:SHL:SCALEBITS)
        ADD     r9,r7,r9,LSL #1                 ; LSL #1 still owed to R9 - see above.
        ADD     r10,r10,r7
        SUB     r11,r11,r7                      ; r11 is to be subtracted from Y, not added.

; now process the four pixels one at a time.
        LDMIA   r0,{r7,r8}                      ; first two Y values, shifted up by SCALEBITS
        ADD     r8,r8,#4:SHL:SCALEBITS          ; ordered dither

        NormaliseGun16 ADD,r12,r7,r9            ; B
        NormaliseGun16 ADD,lr,r7,r10            ; R
        NormaliseGun16 SUB,r7,r7,r11            ; G
        ORR     r7,lr,r7,LSL #5                 ; G and R
        ORR     r4,r7,r12,LSL #10               ; complete output pixel (held in r4 until its neighbour is ready)

        NormaliseGun16 ADD,r12,r8,r9            ; B
        NormaliseGun16 ADD,lr,r8,r10            ; R
        NormaliseGun16 SUB,r8,r8,r11            ; G
        ORR     r8,lr,r8,LSL #5                 ; G and R
        ORR     r6,r8,r12,LSL #10               ; complete output pixel (held in r6 until its neighbour is ready)

        ADD     r7,r0,#8*4                      ; prepare to load next two pixels
        LDMIA   r7,{r7,r8}                      ; other two pixels
        ADD     r7,r7,#6:SHL:SCALEBITS          ; ordered dither
        ADD     r8,r8,#2:SHL:SCALEBITS          ; ordered dither

        NormaliseGun16 ADD,r12,r7,r9            ; B
        NormaliseGun16 ADD,lr,r7,r10            ; R
        NormaliseGun16 SUB,r7,r7,r11            ; G
        ORR     r7,lr,r7,LSL #5                 ; G and R
        ORR     r7,r7,r12,LSL #10               ; complete output pixel
        ORR     r7,r4,r7,LSL #16                ; combine two pixels
        STR     r7,[r1]                         ; output two pixels

        NormaliseGun16 ADD,r12,r8,r9            ; B
        NormaliseGun16 ADD,lr,r8,r10            ; R
        NormaliseGun16 SUB,r8,r8,r11            ; G
        ORR     r8,lr,r8,LSL #5                 ; G and R
        ORR     r8,r8,r12,LSL #10               ; complete output pixel
        ORR     r8,r6,r8,LSL #16                ; combine the two pixels
        STR     r8,[r1,r2,LSL #2]               ; output two pixels on the next row
        ADD     r1,r1,#4

; increment pointers to go two pixels along an output row
; r1 (output pointer) already updated
        ADD     r5,r5,#8*4                      ; pointer into row-organised U,V values
        ADD     r0,r0,#2*8*4                    ; ditto for Y values

; check for end of row
        CMP     r1,r3                           ; output pointer reached end of output row?
        BNE     jc_colour_loop16                ; go round for next column
; Amazingly there is no special action to take half-way along each output row,
; when we switch from one Y block to the next - because of the ordering and
; arrangement of the Y blocks, it just acts as a single 8*16 block.

; It's the end of the row. Update all input and output pointers to
; advance to next one.
        ADD     r1,r1,r2,LSL #2+1               ; advance output ptr by two output rows
        SUB     r1,r1,#16*2                     ; ... and then to beginning of next output row.
        SUB     r5,r5,#64*4-4                   ; advance UV pointer to start of next row
        SUB     r0,r0,#2*64*4-8                 ; ditto Y pointer, but back two blocks
        ADD     r3,r1,#16*2                     ; reset column limit pointer (inner loop) - outptr+16 pixels

; Check for having to change to the second pair of Y blocks, or terminate
        LDMIA   sp,{r4,r6}                      ; reload loop end test registers
        CMP     r1,r4
        BNE     jc_colour_loop16                ; normal case - we set off on another two rows of output

; It's either the half-way point, in which case we need to change to the second pair
; of input Y blocks, or it's the end. Test r4 against the 'real' limit pointer.
        CMP     r4,r6                           ; is this the end?
        LDMEQIA sp!,{r0-r12,pc}                 ; if so, return - nothing more to do

; We've reached the half-way point.
        MOV     r4,r6                           ; next time we test r4 and r6, exit.
        STR     r4,[sp]                         ; remember for final termination
        ADD     r0,r0,#7*8*4+8*8*4              ; advance r0 from end of row 0 of block 0,
                                                ; to start of row 0 of block 2.
        B       jc_colour_loop16                ; and continue.

; ******************************************************************************
; *                                                                            *
; *  YUV->RGB colour conversion for 8bpp output                                *
; *                                                                            *
; ******************************************************************************

        [ cfsi_jpeg
asm_colour_convert_block_8                      ; referenced from C code, but never used.
        |

; Similar in structure to 32bpp and 16bpp output, except that 8bpp (VIDC1) pixels
; are generated. Partial dithering is used, so it's really only good for 1:1 plotting
; at 8bpp.

; A macro to generate an 8bpp pixel from YUV values, and subtract the actual value
; of that pixel from the YUV values.
; As used in this file the y/u/v values come from GetColourValue, which leaves
; them as plain integers clamped to 0..255; the top yuvtab_?bits bits of each
; are combined to index the lookup table.
; r12 holds the lookup table (yuv->pixel, approximate)
; r11 holds the palette (pixel->yuv)
; $pixel and $temp are scratch registers; $dest is the address operand for the STRB.
        MACRO
        Generate8bitFromYUV $y,$u,$v,$pixel,$temp,$dest
        MOVS    $temp,$v,ASR #8-yuvtab_vbits                    ; get relevant bits of v
        MOVS    $pixel,$u,ASR #8-yuvtab_ubits                   ; get relevant bits of u
        ORR     $pixel,$temp,$pixel,LSL #yuvtab_vbits           ; combine u and v
        MOVS    $temp,$y,ASR #8-yuvtab_ybits                    ; get relevant bits of y
        ORR     $pixel,$pixel,$temp,LSL #yuvtab_ubits+yuvtab_vbits ; combine y, u, v
        LDRB    $pixel,[r12,$pixel]                             ; get the pixel value
        STRB    $pixel,$dest                                    ; store the pixel
        LDR     $pixel,[r11,$pixel,LSL #2]                      ; get real yuv value, as bytes 0yuv
        AND     $temp,$pixel,#&ff                               ; get real v value
        SUB     $v,$v,$temp                                     ; subtract it from v
        AND     $temp,$pixel,#&ff00                             ; get real u value
        SUB     $u,$u,$temp,LSR #8                              ; subtract it from u
        SUB     $y,$y,$pixel,LSR #16                            ; subtract it from y
        MEND

; Get a Y, U or V value from the given location, add 128 to it and scale it
; down by SCALEBITS, add it to the cumulative error so far in that gun ($reg),
; and clamp the sum to 0..255. Corrupts r4.
        MACRO
        GetColourValue $reg,$loc
        LDR     r4,$loc                         ; get the value into temp register - shifted up SCALEBITS
        ADD     r4,r4,#128:SHL:SCALEBITS        ; make it in approx range 0..256
        ADDS    $reg,$reg,r4,ASR #SCALEBITS     ; add it to error so far
        MOVLT   $reg,#0                         ; clamp
        CMP     $reg,#255
        MOVGT   $reg,#255
        MEND

; static void colour_convert_block_8(JCOEF *yuv, char *outptr, int outoffset)
; /* yuv[0..3] are Y, yuv[4] is U, yuv[5] is V. Output 16*16 colour block of 8bit pixels */
; r0 -> the six blocks, YYYYUV (all row-ordered, ie 'wrong' way round, different from output)
; r1 -> output buffer
; r2 = offset in words between rows of output
; the output goes in outptr[row*outoffset + col] for row/col in 0..15
asm_colour_convert_block_8
        STMDB   sp!,{r0-r12,lr}                 ; save state
        ADD     r3,r1,#16                       ; column limit pointer (inner loop) - outptr+16 pixels
        ADD     r4,r1,r2,LSL #3+2               ; row limit pointer (outer loop) - outptr+8*outoffset words
        ADD     r5,r0,#4*64*4                   ; pointer into U block. V values 64 words on from this
        ADD     r6,r4,r2,LSL #3+2               ; real row limit pointer - outptr+16*outoffset words
        STMIA   sp,{r4,r6}                      ; r4 and r6 used as temp workspace during the colour conversion:
                                                ; we never need to reload r0/r1, so use these stack locations.
        ADRL    r12,yuv_to_pixel_table
        ADRL    r11,pixel_to_yuv_table
        MOV     r7,#0                           ; cumulative error in U so far
        MOV     r8,#0                           ; cumulative error in V so far
        MOV     r9,#0                           ; cumulative error in Y so far
        SUB     r10,r0,#2*64*4-8                ; set to NEXT y pointer row
;        MOV     r10,#0                         ; for 4x2 cell

; The main loop goes round once for each 2*2 square of four output pixels, using
; four Y values, one U value, one V value.
jc_colour_loop8                                 ; each two rows and each two columns of output
; do four output pixels, using:
;   [r5] is U value
;   [r5,#64*4] is V value
;   [r0] and [r0,#8*4] are Y values for output word [r1]
;   [r0,#4] and [r0,#8*4+4] are Y values for output word [r1,r2,LSL #2]

        GetColourValue r7,[r5]                  ; U value
        GetColourValue r8,"[r5,#64*4]"          ; V value
        GetColourValue r9,"[r0,#4]"             ; Y value
        Generate8bitFromYUV r9,r7,r8,r4,r6,"[r1,r2,LSL #2]"

        GetColourValue r7,[r5]                  ; U value
        GetColourValue r8,"[r5, #64*4]"         ; V value
        GetColourValue r9,[r0]                  ; Y value
        Generate8bitFromYUV r9,r7,r8,r4,r6,[r1]
        ADD     r1,r1,#1

; Next two pixels done in a swapped order, so we do a little U around the square of four
; pixels - this seems to reduce the amount of horizontal-line effect that you get.
        GetColourValue r7,[r5]                  ; U value
        GetColourValue r8,"[r5,#64*4]"          ; V value
        GetColourValue r9,"[r0,#8*4]"           ; Y value
        Generate8bitFromYUV r9,r7,r8,r4,r6,[r1]

        GetColourValue r7,[r5]                  ; U value
        GetColourValue r8,"[r5,#64*4]"          ; V value
        GetColourValue r9,"[r0,#9*4]"           ; Y value
        Generate8bitFromYUV r9,r7,r8,r4,r6,"[r1,r2,LSL #2]"
        ADD     r1,r1,#1

; We have done a cell of four pixels, and have cumulative error values in Y, U, V.
; Instead of carrying all this error over to the next cell at the right, diffuse some
; of it onto the next row as indicated by r10.
; (The diffusion code below is disabled; it is kept for reference.)
;        LDR     r4,[r10,#8*4]
;        ADD     r4,r4,r9,LSL #SCALEBITS-2      ; add quarter the value in r9
;        STR     r4,[r10,#8*4]
;        LDR     r4,[r10]
;        ADD     r4,r4,r9,LSL #SCALEBITS-2      ; add quarter the value in r9
;        STR     r4,[r10]
;        MOV     r9,r9,ASR #1                   ; halve what is left over
; and do it for U and V as well.
        CMP     r10,r0                          ; have we a valid r10?
                                                ; (result unused while the conditional code below is disabled)
;        LDRNE   r4,[r5,#4]
;        ADDNE   r4,r4,r7,LSL #SCALEBITS-1
;        MOVNE   r7,r7,ASR #1
;        STRNE   r4,[r5,#4]
;        LDRNE   r4,[r5,#64*4+4]
;        ADDNE   r4,r4,r8,LSL #SCALEBITS-1
;        MOVNE   r8,r8,ASR #1
;        STRNE   r4,[r5,#64*4+4]

; zero cumulative error on Y U V registers - for 4x2 cell
;        EORS    r10,r10,#1
;        MOVEQ   r7,#0
;        MOVEQ   r8,#0
;        MOVEQ   r9,#0

; increment pointers to go two pixels along an output row
; r1 (output pointer) already updated
        ADD     r5,r5,#8*4                      ; pointer into row-organised U,V values
        ADD     r0,r0,#2*8*4                    ; ditto for Y values

; check for end of row
        CMP     r1,r3                           ; output pointer reached end of output row?
        BNE     jc_colour_loop8                 ; go round for next column
; Amazingly there is no special action to take half-way along each output row,
; when we switch from one Y block to the next - because of the ordering and
; arrangement of the Y blocks, it just acts as a single 8*16 block.

; It's the end of the row. Update all input and output pointers to
; advance to next one.
        ADD     r1,r1,r2,LSL #2+1               ; advance output ptr by two output rows
        SUB     r1,r1,#16                       ; ... and then to beginning of next output row.
        SUB     r5,r5,#64*4-4                   ; advance UV pointer to start of next row
        SUB     r0,r0,#2*64*4-8                 ; ditto Y pointer, but back two blocks
        ADD     r3,r1,#16                       ; reset column limit pointer (inner loop) - outptr+16 pixels

; zero cumulative error on Y U V registers
        MOV     r7,#0
        MOV     r8,#0
        MOV     r9,#0

; Set r10 to pointer to the NEXT row of Y values, or equal to r0 if there isn't
; one or we're at the half-way point (where the next row is hard to find).
        SUB     r10,r0,#2*64*4-8                ; set to NEXT y row pointer

; Check for having to change to the second pair of Y blocks, or terminate
        LDMIA   sp,{r4,r6}                      ; reload loop end test registers
        CMP     r10,r4                          ; check NEXT row pointer
        MOVEQ   r10,r0                          ; if a tricky case, discard it (>>> could do better here, and get half-way point right)
        CMP     r1,r4
        BNE     jc_colour_loop8                 ; normal case - we set off on another two rows of output

; It's either the half-way point, in which case we need to change to the second pair
; of input Y blocks, or it's the end. Test r4 against the 'real' limit pointer.
        CMP     r4,r6                           ; is this the end?
        LDMEQIA sp!,{r0-r12,pc}                 ; if so, return - nothing more to do

; We've reached the half-way point.
        MOV     r4,r6                           ; next time we test r4 and r6, exit.
        STR     r4,[sp]                         ; remember for final termination
        ADD     r0,r0,#7*8*4+8*8*4              ; advance r0 from end of row 0 of block 0,
                                                ; to start of row 0 of block 2.
        SUB     r10,r0,#2*64*4-8                ; set to NEXT y row pointer
        B       jc_colour_loop8                 ; and continue.

        ]

        END