; Copyright 1996 Acorn Computers Ltd
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;     http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.
;
; JPEG Colour conversion facilities
; started 24-Sep-93 WRS
; called from c.rojpeg - C equivalents of these functions can also be found there.

; ******************************************************************************
; *                                                                            *
; *   Monochrome colour conversion                                             *
; *                                                                            *
; ******************************************************************************

        ; Descale one mono sample and expand it into a grey RGB word.
        ; Entry: $r = sample still scaled up by 19 bits, r12 = 128.
        ; Exit:  $r = &00gggggg, where g is the sample descaled and clamped
        ;        to 0..255. Flags are left as set by the clamp comparison.
        MACRO
        MonoConv $r
        MonoConv8 $r                         ; $r = 8 bit grey level g (same descale/clamp as the 8bpp path)
        ORR     $r,$r,$r,LSL #8              ; $r = &0000gggg
        ORR     $r,$r,$r,LSL #8              ; $r = &00gggggg - identical level in R, G and B
        MEND

        ; Descale one mono sample to a single 8 bit grey level.
        ; Entry: $r = sample still scaled up by 19 bits, r12 = 128.
        ; Exit:  $r = (sample rounded and shifted down 19 bits) + 128,
        ;        clamped to 0..255. Flags are left as set by the CMP.
        MACRO
        MonoConv8 $r
        ADD     $r,$r,#1:SHL:18          ; add half an output unit so the shift rounds
        ADDS    $r,r12,$r,ASR #19        ; add 128, cut off 19 bits
        MOVLT   $r,#0                    ; if result < 0, cut off at 0
        CMP     $r,#255
        MOVGT   $r,#255                  ; if result > 255, cut off at 255.
        MEND

; extern void asm_mono_convert_block(JBLOCK jblock, int *outptr, int outoffset)
; /* Convert greyscale image into 32bit RGB values. */
; Entry:
;   r0 = jblock    - in row order, values that need descaling
;   r1 = outptr    - in column order, RGB words are written here
;   r2 = outoffset - distance (in words) between output rows
; Workspace:
;   r3-r10 = the eight pixels of the output row being built
;   r11    = value r0 will have once all eight output rows are done
;   r12    = 128 (needed by MonoConv)
; Exit: r0-r12 restored from the stack.
; Each output row is gathered with LDRs and written with one STM rather than
; vice versa, because a long sequence of STRs clogs up the write buffer and is slow.
asm_mono_convert_block
        STMDB   sp!,{r0-r12,lr}             ; stack everything we touch
        MOV     r12,#128                    ; level shift constant for MonoConv
        ADD     r11,r0,#8*4                 ; r0 moves on one word per row: stop 8 words on

jc_mono_loop
        LDR     r3,[r0,#8*4*0]              ; gather eight values, each 8 words apart
        LDR     r4,[r0,#8*4*1]              ; loads are kept ahead of the conversions (hoped to help ARM8)
        MonoConv r3
        LDR     r5,[r0,#8*4*2]
        MonoConv r4
        LDR     r6,[r0,#8*4*3]
        MonoConv r5
        LDR     r7,[r0,#8*4*4]
        MonoConv r6
        LDR     r8,[r0,#8*4*5]
        MonoConv r7
        LDR     r9,[r0,#8*4*6]
        MonoConv r8
        LDR     r10,[r0,#8*4*7]
        MonoConv r9
        MonoConv r10

        STMIA   r1,{r3-r10}                 ; write the finished row in one burst
        ADD     r0,r0,#4                    ; step to the next input word
        ADD     r1,r1,r2,LSL #2             ; step to the next output row
        CMP     r0,r11                      ; all eight rows produced?
        BNE     jc_mono_loop

        LDMIA   sp!,{r0-r12,pc}             ; restore registers and return

; extern void asm_mono_convert_block_8(JBLOCK jblock, int *outptr, int outoffset)
; /* Convert greyscale image into 8bit Grey values. */
; Entry:
;   r0 = jblock    - in row order, values that need descaling
;   r1 = outptr    - in column order, grey bytes are written here
;   r2 = outoffset - distance (in words) between output rows
; Workspace:
;   r3-r10 = the eight grey levels of the output row being built;
;            they are packed into r3 (pixels 0-3) and r7 (pixels 4-7)
;   r11    = value r0 will have once all eight output rows are done
;   r12    = 128 (needed by MonoConv8)
; Exit: r0-r12 restored from the stack.
; Each output row is gathered with LDRs and written with one STM rather than
; vice versa, because a long sequence of STRs clogs up the write buffer and is slow.
asm_mono_convert_block_8
        STMDB   sp!,{r0-r12,lr}             ; stack everything we touch
        MOV     r12,#128                    ; level shift constant for MonoConv8
        ADD     r11,r0,#8*4                 ; r0 moves on one word per row: stop 8 words on

jc_mono_loop8
        LDR     r3,[r0,#8*4*0]              ; gather eight values, each 8 words apart
        LDR     r4,[r0,#8*4*1]              ; loads are kept ahead of the conversions (hoped to help ARM8)
        MonoConv8 r3
        LDR     r5,[r0,#8*4*2]
        MonoConv8 r4
        ORR     r3,r3,r4,LSL #8             ; pixel 1 into byte 1 of the first word
        LDR     r6,[r0,#8*4*3]
        MonoConv8 r5
        ORR     r3,r3,r5,LSL #16            ; pixel 2 into byte 2
        LDR     r7,[r0,#8*4*4]
        MonoConv8 r6
        ORR     r3,r3,r6,LSL #24            ; pixel 3 into byte 3 - first word complete
        LDR     r8,[r0,#8*4*5]
        MonoConv8 r7
        LDR     r9,[r0,#8*4*6]
        MonoConv8 r8
        ORR     r7,r7,r8,LSL #8             ; pixel 5 into byte 1 of the second word
        LDR     r10,[r0,#8*4*7]
        MonoConv8 r9
        ORR     r7,r7,r9,LSL #16            ; pixel 6 into byte 2
        MonoConv8 r10
        ORR     r7,r7,r10,LSL #24           ; pixel 7 into byte 3 - second word complete

        STMIA   r1,{r3,r7}                  ; write the eight packed pixels
        ADD     r0,r0,#4                    ; step to the next input word
        ADD     r1,r1,r2,LSL #2             ; step to the next output row
        CMP     r0,r11                      ; all eight rows produced?
        BNE     jc_mono_loop8

        LDMIA   sp!,{r0-r12,pc}             ; restore registers and return

; ******************************************************************************
; *                                                                            *
; *   YUV->RGB colour conversion.                                              *
; *                                                                            *
; ******************************************************************************

; Given four 8*8 Y blocks and one block each of U and V, create 16*16 output
; RGB pixels.

; Fixed point format of the values held in the DCT blocks: every value is
; scaled up by SCALEBITS bits. ONE_HALF is 0.5 in that format (&40000).
SCALEBITS * 19
ONE_HALF * 1:SHL:(SCALEBITS-1)

; Combine the y value of r/g/b with the value derived from u and v,
; then normalise the result to be a value in 0..255.
;   $op     = ADD or SUB, the operation used to combine the two values
;   $y      = the y value, scaled up by SCALEBITS
;   $gunsrc = the value derived from u and v, scaled up by SCALEBITS
;   $gun    = receives the result ($gun may be the same register as $y)
; Flags are corrupted.
; The negative clamp tests MI (N set) rather than LT (N <> V): MOVS updates
; N, Z and C but leaves V as it was, so an LT test would depend on whatever
; V value an earlier, unrelated instruction happened to leave behind.
        MACRO
        NormaliseGun $op,$gun,$y,$gunsrc
        $op     $gun,$y,$gunsrc             ; R/G/B, at 19 bits
        MOVS    $gun,$gun,ASR #SCALEBITS    ; truncate, setting N from the sign of the result
        MOVMI   $gun,#0                     ; negative: cut off at 0
        CMP     $gun,#255
        MOVGT   $gun,#255                   ; too big: cut off at 255
        MEND

; static void colour_convert_block(JCOEF *yuv, int *outptr, int outoffset)
; /* yuv[0..3] are Y, yuv[4] is U, yuv[5] is V. Output 16*16 colour block */
;   r0 -> the six blocks, YYYYUV (all row-ordered, ie 'wrong' way round, different from output)
;   r1 -> output buffer
;   r2 = offset in words between rows of output
; the output goes in outptr[row*outoffset + col] for row/col in 0..15
; Each output word is built as R + (G << 8) + (B << 16).
; Register use inside the loop:
;   r3  = column limit pointer (end of the current output row)
;   r4  = current row limit pointer, r6 = final row limit pointer
;   r5 -> current U value (the matching V value is 64 words further on)
;   r9, r10, r11 = B, R and -G terms derived from U and V
;   r7, r8, r12, lr = scratch
; r0-r12 are restored on exit.
asm_colour_convert_block
        STMDB   sp!,{r0-r12,lr}             ; save state
;        Debug   gs,"in colour_convert_block"
        ADD     r3,r1,#16*4                 ; column limit pointer (inner loop) - outptr+16 words
        ADD     r4,r1,r2,LSL #3+2           ; row limit pointer (outer loop) - outptr+8*outoffset words
        ADD     r5,r0,#4*64*4               ; pointer into U block. V values 64 words on from this
        ADD     r6,r4,r2,LSL #3+2           ; real row limit pointer - outptr+16*outoffset words

; The main loop goes round once for each 2*2 square of four output pixels, using
; four Y values, one U value, one V value.
jc_colour_loop                              ; each two rows and each two columns of output

        ; do four output pixels, using:
        ; [r5] is U value
        ; [r5,#64*4] is V value
        ; [r0] is Y value for output word [r1]
        ; [r0,#4] is Y value for output word [r1,r2,LSL #2]
        ; [r0,#8*4] is Y value for output word [r1,#4]
        ; [r0,#8*4+4] is Y value for output word [r1,#4 + r2,LSL #2] (so to speak!)

        ; first we compute the values derived from U and V, which are
        ; true for all four pixels.
        LDR     r7,[r5]                     ; U value
        MOV     r7,r7,ASR #8                ; the multiplies will get us back to SCALEBITS again
        LDR     r8,[r5, #64*4]              ; V value
        MOV     r8,r8,ASR #8                ; the multiplies will get us back to SCALEBITS again
        ; Should add 1:SHL:7 before ASR #8, but not regarded as significant enough
        ; considering how much bigger SCALEBITS is.

        ; Shift-and-add multiply sequences, for 8 bits of accuracy: each one
        ; multiplies by the constant scaled up by 256, which makes up for the
        ; ASR #8 above.
        ;MulCon  r9,r7,FIX(1.77200)          ; B, without the Y yet
        ; 1.772*256 = 454 = 2*227. The last step used to be SUB ...,LSL #3,
        ; which gave 2*215 = 430, ie a factor of 1.680 instead of 1.772.
        ADD      r9,r7,r7,LSL #5             ; 33*U
        RSB      r9,r9,r7,LSL #8             ; 256*U - 33*U = 223*U
        ADD      r9,r9,r7,LSL #2             ; 227*U - still needs a LSL #1

        ;MulCon  r10,r8,FIX(1.40200)         ; R, without the Y yet
        ADD     r10,r8,r8,LSL #5             ; 33*V
        RSB     r10,r10,r10,LSL #3           ; 231*V
        ADD     r10,r10,r8,LSL #7            ; 359*V

        ;MulCon  r11,r7,-FIX(0.34414)
        ADD     r11,r7,r7,LSL #1             ; 3*U
        ADD     r11,r11,r7,LSL #3            ; 11*U - LSL #3 still needed (88*U) - see below

        ;MulCon  r12,r8,-FIX(0.71414)
        RSB     r12,r8,r8,LSL #6             ; 63*V
        ADD     r12,r12,r8,LSL #7            ; 191*V
        SUB     r12,r12,r8,LSL #3            ; 183*V

        ; After those multiplies, the values are shifted up by SCALEBITS again.
        ; scratch r7,r8
        ADD     r11,r12,r11,LSL #3          ; -G, without the Y yet - did the LSL #3, see above.
        ; scratch r12

        ; We're going to add each of r9/r10/r11 to the Y values.
        ; The Y values need 128 added to them - add it at this point.
        ; need to add a half for the truncation - do that at the same time.
        ; We'll be truncating at SCALEBITS.
        MOV     r7,#ONE_HALF                ; construct constant - can't quite be done in one instruction.
        ADD     r7,r7,#128:SHL:SCALEBITS
        ADD     r9,r7,r9,LSL #1             ; LSL #1 still owed to R9 - see above.
        ADD     r10,r10,r7
        SUB     r11,r11,r7                  ; r11 is to be subtracted from Y, not added.

        ; now process the four pixels one at a time.
        LDMIA   r0,{r7,r8}                  ; first two Y values, shifted up by SCALEBITS
        NormaliseGun ADD,r12,r7,r9          ; B
        NormaliseGun ADD,lr,r7,r10          ; R
        NormaliseGun SUB,r7,r7,r11          ; G
        ORR     r7,lr,r7,LSL #8             ; G and R
        ORR     r7,r7,r12,LSL #16           ; complete output pixel
        STR     r7,[r1]                     ; output pixel
;        Debug   gs,"conp = ",r7

        NormaliseGun ADD,r12,r8,r9          ; B
        NormaliseGun ADD,lr,r8,r10          ; R
        NormaliseGun SUB,r8,r8,r11          ; G
        ORR     r8,lr,r8,LSL #8             ; G and R
        ORR     r8,r8,r12,LSL #16           ; complete output pixel
        STR     r8,[r1,r2,LSL #2]           ; output pixel, one row down
        ADD     r1,r1,#4

        ADD     r7,r0,#8*4                  ; prepare to load next two pixels
        LDMIA   r7,{r7,r8}                  ; other two pixels
        NormaliseGun ADD,r12,r7,r9          ; B
        NormaliseGun ADD,lr,r7,r10          ; R
        NormaliseGun SUB,r7,r7,r11          ; G
        ORR     r7,lr,r7,LSL #8             ; G and R
        ORR     r7,r7,r12,LSL #16           ; complete output pixel
        STR     r7,[r1]                     ; output pixel

        NormaliseGun ADD,r12,r8,r9          ; B
        NormaliseGun ADD,lr,r8,r10          ; R
        NormaliseGun SUB,r8,r8,r11          ; G
        ORR     r8,lr,r8,LSL #8             ; G and R
        ORR     r8,r8,r12,LSL #16           ; complete output pixel
        STR     r8,[r1,r2,LSL #2]           ; output pixel, one row down
        ADD     r1,r1,#4

        ; increment pointers to go two pixels along an output row
        ; r1 (output pointer) already updated
        ADD     r5,r5,#8*4                  ; pointer into row-organised U,V values
        ADD     r0,r0,#2*8*4                ; ditto for Y values

        ; check for end of row
        CMP     r1,r3                       ; output pointer reached end of output row?
        BNE     jc_colour_loop              ; go round for next column

        ; Amazingly there is no special action to take half-way along each output row,
        ; when we switch from one Y block to the next - because of the ordering and
        ; arrangement of the Y blocks, it just acts as a single 8*16 block.

        ; It's the end of the row. Update all input and output pointers to
        ; advance to next one.
        ADD     r1,r1,r2,LSL #2+1           ; advance output ptr by two output rows
        SUB     r1,r1,#16*4                 ;      ... and then to beginning of next output row.
        SUB     r5,r5,#64*4-4               ; advance UV pointer to start of next row
        SUB     r0,r0,#2*64*4-8             ; ditto Y pointer, but back two blocks
        ADD     r3,r1,#16*4                 ; reset column limit pointer (inner loop) - outptr+16 words

        ; Check for having to change to the second pair of Y blocks, or terminate
        CMP     r1,r4
        BNE     jc_colour_loop              ; normal case - we set off on another two rows of output

        ; It's either the half-way point, in which case we need to change to the second pair
        ; of input Y blocks, or it's the end. Test r4 against the 'real' limit pointer.

        CMP     r4,r6                       ; is this the end?
        LDMEQIA sp!,{r0-r12,pc}             ; if so, return - nothing more to do

        ; We've reached the half-way point.
        MOV     r4,r6                       ; next time we test r4 and r6, exit.
        ADD     r0,r0,#7*8*4+8*8*4          ; advance r0 from end of row 0 of block 0,
                                            ; to start of row 0 of block 2.
        B       jc_colour_loop              ; and continue.

; Performance notes for colour conversion.
; approx number of ticks for 4 pixels:
;    7        loop etc.
;   23        common work on the UV values
;   40        loading/storing,combining gun values
;   52        normalising gun values
;   --
;  122/4 = 30.5 per pixel.

; Could use a register to hold &101, saves 1 per pixel.

; How common is <0 and >255? If rare,
; test all at once (easy on <0, harder for >255)
;   and do a B if there's a problem.
; Not QUITE worth it, overflow about 5% of the time.

; Lookup tables don't save much - 3 instructions becomes a LDR.
; Could replace whole 'normalise' by a single LDR? Not QUITE worth it.
; A lookup for the entire computation, indexed by Y and U and V, saves quite
; a lot. BUT, 5 bits of U and V is insufficient, in smoothly shaded pictures
; (after a brief experiment). 6 bits each makes a .5MB table, too big!

; ******************************************************************************
; *                                                                            *
; *   YUV->32bpp grey conversion.                                              *
; *                                                                            *
; ******************************************************************************

; Given four 8*8 Y blocks and one block each of U and V, create 16*16 output
; RGB pixels.
; (No code follows under this heading in this part of the file.)


; ******************************************************************************
; *                                                                            *
; *   YUV->RGB colour conversion for 16bpp output                              *
; *                                                                            *
; ******************************************************************************

; Almost identical to the 32bpp case above, except that 16bit pixels are generated.
; An ordered dither is added to this - really only good for 1:1 plotting in 16bpp.
; Thus, a block copy can move it to the screen, and the whole thing is a great
; deal faster.

; Combine the y value of r/g/b with the value derived from u and v,
; then normalise the result to be a value in 0..31.
;   $op     = ADD or SUB, the operation used to combine the two values
;   $y      = the y value, scaled up by SCALEBITS
;   $gunsrc = the value derived from u and v, scaled up by SCALEBITS
;   $gun    = receives the result ($gun may be the same register as $y)
; Flags are corrupted.
; The negative clamp tests MI (N set) rather than LT (N <> V): MOVS updates
; N, Z and C but leaves V as it was, so an LT test would depend on whatever
; V value an earlier, unrelated instruction happened to leave behind.
        MACRO
        NormaliseGun16 $op,$gun,$y,$gunsrc
        $op     $gun,$y,$gunsrc             ; R/G/B, at 19 bits
        MOVS    $gun,$gun,ASR #SCALEBITS+3  ; truncate to 5 bits, setting N from the sign of the result
        MOVMI   $gun,#0                     ; negative: cut off at 0
        CMP     $gun,#31
        MOVGT   $gun,#31                    ; too big: cut off at 31
        MEND

; static void colour_convert_block_16(JCOEF *yuv, short int *outptr, int outoffset)
; /* yuv[0..3] are Y, yuv[4] is U, yuv[5] is V. Output 16*16 colour block of 16bit pixels */
;   r0 -> the six blocks, YYYYUV (all row-ordered, ie 'wrong' way round, different from output)
;   r1 -> output buffer
;   r2 = offset in words between rows of output
; the output goes in outptr[row*outoffset + col] for row/col in 0..15
; Each 16 bit pixel is built as R + (G << 5) + (B << 10), five bits per gun.
; Two horizontally adjacent pixels are packed into each word stored.
; Register use inside the loop:
;   r3  = column limit pointer (end of the current output row)
;   r5 -> current U value (the matching V value is 64 words further on)
;   r9, r10, r11 = B, R and -G terms derived from U and V
;   r4, r6 = the two left-hand pixels of the 2*2 cell while it is being built;
;            the row limit pointers they normally hold live on the stack
;   r7, r8, r12, lr = scratch
; On exit r2-r12 are restored. r0 and r1 are NOT restored: their stack slots
; are reused to hold the row limit pointers.
asm_colour_convert_block_16
        STMDB   sp!,{r0-r12,lr}             ; save state
        ADD     r3,r1,#16*2                 ; column limit pointer (inner loop) - outptr+16 pixels
        ADD     r4,r1,r2,LSL #3+2           ; row limit pointer (outer loop) - outptr+8*outoffset words
        ADD     r5,r0,#4*64*4               ; pointer into U block. V values 64 words on from this
        ADD     r6,r4,r2,LSL #3+2           ; real row limit pointer - outptr+16*outoffset words

        STMIA   sp,{r4,r6}                  ; r4 and r6 used as temp workspace during the colour conversion:
                                            ; we never need to reload r0/r1, so use these stack locations.

; The main loop goes round once for each 2*2 square of four output pixels, using
; four Y values, one U value, one V value.
jc_colour_loop16                              ; each two rows and each two columns of output

        ; do four output pixels, using:
        ; [r5] is U value
        ; [r5,#64*4] is V value
        ; [r0] and [r0,#8*4] are Y values for output word [r1]
        ; [r0,#4] and [r0,#8*4+4] are Y values for output word [r1,r2,LSL #2]

        ; first we compute the values derived from U and V, which are
        ; true for all four pixels.
        LDR     r7,[r5]                     ; U value
        MOV     r7,r7,ASR #8                ; the multiplies will get us back to SCALEBITS again
        LDR     r8,[r5, #64*4]              ; V value
        MOV     r8,r8,ASR #8                ; the multiplies will get us back to SCALEBITS again
        ; Should add 1:SHL:7 before ASR #8, but not regarded as significant enough
        ; considering how much bigger SCALEBITS is.

        ; Shift-and-add multiply sequences, for 8 bits of accuracy: each one
        ; multiplies by the constant scaled up by 256, which makes up for the
        ; ASR #8 above.
        ;MulCon  r9,r7,FIX(1.77200)          ; B, without the Y yet
        ; 1.772*256 = 454 = 2*227. The last step used to be SUB ...,LSL #3,
        ; which gave 2*215 = 430, ie a factor of 1.680 instead of 1.772.
        ADD      r9,r7,r7,LSL #5             ; 33*U
        RSB      r9,r9,r7,LSL #8             ; 256*U - 33*U = 223*U
        ADD      r9,r9,r7,LSL #2             ; 227*U - still needs a LSL #1

        ;MulCon  r10,r8,FIX(1.40200)         ; R, without the Y yet
        ADD     r10,r8,r8,LSL #5             ; 33*V
        RSB     r10,r10,r10,LSL #3           ; 231*V
        ADD     r10,r10,r8,LSL #7            ; 359*V

        ;MulCon  r11,r7,-FIX(0.34414)
        ADD     r11,r7,r7,LSL #1             ; 3*U
        ADD     r11,r11,r7,LSL #3            ; 11*U - LSL #3 still needed (88*U) - see below

        ;MulCon  r12,r8,-FIX(0.71414)
        RSB     r12,r8,r8,LSL #6             ; 63*V
        ADD     r12,r12,r8,LSL #7            ; 191*V
        SUB     r12,r12,r8,LSL #3            ; 183*V

        ; After those multiplies, the values are shifted up by SCALEBITS again.
        ; scratch r7,r8
        ADD     r11,r12,r11,LSL #3          ; -G, without the Y yet - did the LSL #3, see above.
        ; scratch r12

        ; We're going to add each of r9/r10/r11 to the Y values.
        ; The Y values need 128 added to them - add it at this point.
        ; need to add a half for the truncation - do that at the same time.
        ; We'll be truncating at SCALEBITS+3, so the half is ONE_HALF:SHL:3; unlike
        ; the 32bpp case the combined constant fits in a single MOV.
;        MOV     r7,#ONE_HALF                ; construct constant - can't quite be done in one instruction.
;        ADD     r7,r7,#128:SHL:SCALEBITS
        MOV     r7,#(ONE_HALF:SHL:3)+(128:SHL:SCALEBITS)
        ADD     r9,r7,r9,LSL #1             ; LSL #1 still owed to R9 - see above.
        ADD     r10,r10,r7
        SUB     r11,r11,r7                  ; r11 is to be subtracted from Y, not added.

        ; now process the four pixels one at a time.
        ; The dither amounts added to the four Y values of the cell are
        ; 0 and 6 on the upper row, 4 and 2 on the lower row.
        LDMIA   r0,{r7,r8}                  ; first two Y values, shifted up by SCALEBITS
        ADD     r8,r8,#4:SHL:SCALEBITS      ; ordered dither
        NormaliseGun16 ADD,r12,r7,r9        ; B
        NormaliseGun16 ADD,lr,r7,r10        ; R
        NormaliseGun16 SUB,r7,r7,r11        ; G
        ORR     r7,lr,r7,LSL #5             ; G and R
        ORR     r4,r7,r12,LSL #10           ; complete output pixel - kept in r4 until its neighbour is ready

        NormaliseGun16 ADD,r12,r8,r9        ; B
        NormaliseGun16 ADD,lr,r8,r10        ; R
        NormaliseGun16 SUB,r8,r8,r11        ; G
        ORR     r8,lr,r8,LSL #5             ; G and R
        ORR     r6,r8,r12,LSL #10           ; complete output pixel - kept in r6 until its neighbour is ready

        ADD     r7,r0,#8*4                  ; prepare to load next two pixels
        LDMIA   r7,{r7,r8}                  ; other two pixels
        ADD     r7,r7,#6:SHL:SCALEBITS      ; ordered dither
        ADD     r8,r8,#2:SHL:SCALEBITS      ; ordered dither
        NormaliseGun16 ADD,r12,r7,r9        ; B
        NormaliseGun16 ADD,lr,r7,r10        ; R
        NormaliseGun16 SUB,r7,r7,r11        ; G
        ORR     r7,lr,r7,LSL #5             ; G and R
        ORR     r7,r7,r12,LSL #10           ; complete output pixel
        ORR     r7,r4,r7,LSL #16            ; combine two pixels
        STR     r7,[r1]                     ; output two pixels

        NormaliseGun16 ADD,r12,r8,r9        ; B
        NormaliseGun16 ADD,lr,r8,r10        ; R
        NormaliseGun16 SUB,r8,r8,r11        ; G
        ORR     r8,lr,r8,LSL #5             ; G and R
        ORR     r8,r8,r12,LSL #10           ; complete output pixel
        ORR     r8,r6,r8,LSL #16            ; combine the two pixels
        STR     r8,[r1,r2,LSL #2]           ; output two pixels, one row down
        ADD     r1,r1,#4

        ; increment pointers to go two pixels along an output row
        ; r1 (output pointer) already updated
        ADD     r5,r5,#8*4                  ; pointer into row-organised U,V values
        ADD     r0,r0,#2*8*4                ; ditto for Y values

        ; check for end of row
        CMP     r1,r3                       ; output pointer reached end of output row?
        BNE     jc_colour_loop16            ; go round for next column

        ; Amazingly there is no special action to take half-way along each output row,
        ; when we switch from one Y block to the next - because of the ordering and
        ; arrangement of the Y blocks, it just acts as a single 8*16 block.

        ; It's the end of the row. Update all input and output pointers to
        ; advance to next one.
        ADD     r1,r1,r2,LSL #2+1           ; advance output ptr by two output rows
        SUB     r1,r1,#16*2                 ;      ... and then to beginning of next output row.
        SUB     r5,r5,#64*4-4               ; advance UV pointer to start of next row
        SUB     r0,r0,#2*64*4-8             ; ditto Y pointer, but back two blocks
        ADD     r3,r1,#16*2                 ; reset column limit pointer (inner loop) - outptr+16 pixels

        ; Check for having to change to the second pair of Y blocks, or terminate
        LDMIA   sp,{r4,r6}                  ; reload loop end test registers
        CMP     r1,r4
        BNE     jc_colour_loop16            ; normal case - we set off on another two rows of output

        ; It's either the half-way point, in which case we need to change to the second pair
        ; of input Y blocks, or it's the end. Test r4 against the 'real' limit pointer.

        CMP     r4,r6                       ; is this the end?
        LDMEQIA sp!,{r0-r12,pc}             ; if so, return - nothing more to do

        ; We've reached the half-way point.
        MOV     r4,r6                       ; next time we test r4 and r6, exit.
        STR     r4,[sp]                     ; remember for final termination
        ADD     r0,r0,#7*8*4+8*8*4          ; advance r0 from end of row 0 of block 0,
                                            ; to start of row 0 of block 2.
        B       jc_colour_loop16            ; and continue.

; ******************************************************************************
; *                                                                            *
; *   YUV->RGB colour conversion for 8bpp output                               *
; *                                                                            *
; ******************************************************************************

      [ cfsi_jpeg
asm_colour_convert_block_8 ; referenced from C code, but never used.
      |

; Similar in structure to 32bpp and 16bpp output, except that 8bpp (VIDC1) pixels
; are generated. Partial dithering is used, so it's really only good for 1:1 plotting
; at 8bpp.

; A macro to generate an 8bpp pixel from YUV values, and subtract the actual value
; of that pixel from the YUV values.
; As invoked in this file, $y, $u and $v have each just been through
; GetColourValue, so they are plain values clamped to 0..255 (no longer
; scaled up by SCALEBITS).
; r12 holds the lookup table (yuv->pixel, approximate), indexed by the top
;     yuvtab_ybits/ubits/vbits bits of y, u and v packed as y:u:v
; r11 holds the palette (pixel->yuv), one word per pixel value with v in
;     bits 0-7, u in bits 8-15 and y from bit 16 upwards
;   $pixel, $temp = scratch registers, corrupted
;   $dest         = addressing mode operand for the STRB which writes the pixel
; On exit $y, $u and $v hold the remaining error (wanted minus actual).
; Flags are corrupted by the MOVS instructions; nothing in the macro tests them.
        MACRO
        Generate8bitFromYUV   $y,$u,$v,$pixel,$temp,$dest
        MOVS    $temp,$v,ASR #8-yuvtab_vbits                  ; get relevant bits of v
        MOVS    $pixel,$u,ASR #8-yuvtab_ubits                 ; get relevant bits of u
        ORR     $pixel,$temp,$pixel,LSL #yuvtab_vbits         ; combine u and v
        MOVS    $temp,$y,ASR #8-yuvtab_ybits                  ; get relevant bits of y
        ORR     $pixel,$pixel,$temp,LSL #yuvtab_ubits+yuvtab_vbits ; combine y, u, v
        LDRB    $pixel,[r12,$pixel]                           ; get the pixel value
        STRB    $pixel,$dest                                  ; store the pixel
        LDR     $pixel,[r11,$pixel,LSL #2]                    ; get real yuv value, as bytes 0yuv
        AND     $temp,$pixel,#&ff                             ; get real v value
        SUB     $v,$v,$temp                                   ; subtract it from v
        AND     $temp,$pixel,#&ff00                           ; get real u value
        SUB     $u,$u,$temp,LSR #8                            ; subtract it from u
        SUB     $y,$y,$pixel,LSR #16                          ; subtract real y value from y
        MEND

; Get a Y, U or V value from the given location, add 128 to it to bring it
; into the approximate range 0..256, scale it down by SCALEBITS and add it to
; the cumulative error so far in that gun. The sum is then clamped to 0..255.
;   $reg = cumulative error on entry, value to use for this pixel on exit
;   $loc = addressing mode operand for the LDR which fetches the value
; Corrupts r4 and the flags.
        MACRO
        GetColourValue $reg,$loc
        LDR     r4,$loc                                       ; get the value into temp register - shifted up SCALEBITS
        ADD     r4,r4,#128:SHL:SCALEBITS                      ; make it in approx range 0..256
        ADDS    $reg,$reg,r4,ASR #SCALEBITS                   ; add it to error so far
        MOVLT   $reg,#0                                       ; clamp
        CMP     $reg,#255
        MOVGT   $reg,#255
        MEND

; static void colour_convert_block_8(JCOEF *yuv, char *outptr, int outoffset)
; /* yuv[0..3] are Y, yuv[4] is U, yuv[5] is V. Output 16*16 colour block of 8bit pixels */
;   r0 -> the six blocks, YYYYUV (all row-ordered, ie 'wrong' way round, different from output)
;   r1 -> output buffer
;   r2 = offset in words between rows of output
; the output goes in outptr[row*outoffset + col] for row/col in 0..15
; Register use inside the loop:
;   r3  = column limit pointer (end of the current output row)
;   r5 -> current U value (the matching V value is 64 words further on)
;   r7, r8, r9 = cumulative error in U, V, Y carried from pixel to pixel
;   r4, r6 = scratch for the macros; the row limit pointers they normally
;            hold live on the stack
;   r10 = 'next Y row' pointer. Every instruction that would use it is
;         commented out at present, so it has no effect on the output.
;   r11 -> pixel_to_yuv_table, r12 -> yuv_to_pixel_table
; On exit r2-r12 are restored. r0 and r1 are NOT restored: their stack slots
; are reused to hold the row limit pointers.
asm_colour_convert_block_8

        STMDB   sp!,{r0-r12,lr}             ; save state
        ADD     r3,r1,#16                   ; column limit pointer (inner loop) - outptr+16 pixels
        ADD     r4,r1,r2,LSL #3+2           ; row limit pointer (outer loop) - outptr+8*outoffset words
        ADD     r5,r0,#4*64*4               ; pointer into U block. V values 64 words on from this
        ADD     r6,r4,r2,LSL #3+2           ; real row limit pointer - outptr+16*outoffset words

        STMIA   sp,{r4,r6}                  ; r4 and r6 used as temp workspace during the colour conversion:
                                            ; we never need to reload r0/r1, so use these stack locations.

        ADRL    r12,yuv_to_pixel_table
        ADRL    r11,pixel_to_yuv_table
        MOV     r7,#0                       ; cumulative error in U so far
        MOV     r8,#0                       ; cumulative error in V so far
        MOV     r9,#0                       ; cumulative error in Y so far

        SUB     r10,r0,#2*64*4-8            ; set to NEXT y pointer row
;        MOV     r10,#0                      ; for 4x2 cell

; The main loop goes round once for each 2*2 square of four output pixels, using
; four Y values, one U value, one V value.
jc_colour_loop8                             ; each two rows and each two columns of output

        ; do four output pixels, using:
        ; [r5] is U value
        ; [r5,#64*4] is V value
        ; [r0] and [r0,#8*4] are Y values for the two output bytes on the upper row
        ; [r0,#4] and [r0,#9*4] are Y values for the two output bytes on the row below
        ; The U and V values are fetched again for every pixel, because each
        ; pixel generated leaves a fresh error behind in r7/r8/r9.

        GetColourValue r7,[r5]              ; U value
        GetColourValue r8,"[r5,#64*4]"      ; V value
        GetColourValue r9,"[r0,#4]"         ; Y value
        Generate8bitFromYUV r9,r7,r8,r4,r6,"[r1,r2,LSL #2]"

        GetColourValue r7,[r5]              ; U value
        GetColourValue r8,"[r5, #64*4]"     ; V value
        GetColourValue r9,[r0]              ; Y value
        Generate8bitFromYUV r9,r7,r8,r4,r6,[r1]
        ADD     r1,r1,#1

        ; Next two pixels done in a swapped order, so we do a little U around the square of four
        ; pixels - this seems to reduce the amount of horizontal-line effect that you get.

        GetColourValue r7,[r5]              ; U value
        GetColourValue r8,"[r5,#64*4]"      ; V value
        GetColourValue r9,"[r0,#8*4]"       ; Y value
        Generate8bitFromYUV r9,r7,r8,r4,r6,[r1]

        GetColourValue r7,[r5]              ; U value
        GetColourValue r8,"[r5,#64*4]"      ; V value
        GetColourValue r9,"[r0,#9*4]"       ; Y value
        Generate8bitFromYUV r9,r7,r8,r4,r6,"[r1,r2,LSL #2]"
        ADD     r1,r1,#1

        ; We have done a cell of four pixels, and have cumulative error values in Y, U, V.
        ; Instead of carrying all this error over to the next cell at the right, diffuse some
        ; of it onto the next row as indicated by r10.
        ; (This diffusion is disabled: the code for it is commented out.)
 ;       LDR     r4,[r10,#8*4]
 ;       ADD     r4,r4,r9,LSL #SCALEBITS-2   ; add quarter the value in r9
 ;       STR     r4,[r10,#8*4]
 ;       LDR     r4,[r10]
 ;       ADD     r4,r4,r9,LSL #SCALEBITS-2   ; add quarter the value in r9
 ;       STR     r4,[r10]
 ;       MOV     r9,r9,ASR #1                ; halve what is left over

        ; and do it for U and V as well.
        ; (The conditional instructions which used the result of this CMP are
        ; commented out, and the flags are set again before anything tests them.)
        CMP     r10,r0                      ; have we a valid r10?
 ;       LDRNE   r4,[r5,#4]
 ;       ADDNE   r4,r4,r7,LSL #SCALEBITS-1
 ;       MOVNE   r7,r7,ASR #1
 ;       STRNE   r4,[r5,#4]
 ;       LDRNE   r4,[r5,#64*4+4]
 ;       ADDNE   r4,r4,r8,LSL #SCALEBITS-1
 ;       MOVNE   r8,r8,ASR #1
 ;       STRNE   r4,[r5,#64*4+4]

        ; zero cumulative error on Y U V registers - for 4x2 cell
;        EORS    r10,r10,#1
;        MOVEQ   r7,#0
;        MOVEQ   r8,#0
;        MOVEQ   r9,#0

        ; increment pointers to go two pixels along an output row
        ; r1 (output pointer) already updated
        ADD     r5,r5,#8*4                  ; pointer into row-organised U,V values
        ADD     r0,r0,#2*8*4                ; ditto for Y values

        ; check for end of row
        CMP     r1,r3                       ; output pointer reached end of output row?
        BNE     jc_colour_loop8             ; go round for next column

        ; Amazingly there is no special action to take half-way along each output row,
        ; when we switch from one Y block to the next - because of the ordering and
        ; arrangement of the Y blocks, it just acts as a single 8*16 block.

        ; It's the end of the row. Update all input and output pointers to
        ; advance to next one.
        ADD     r1,r1,r2,LSL #2+1           ; advance output ptr by two output rows
        SUB     r1,r1,#16                   ;      ... and then to beginning of next output row.
        SUB     r5,r5,#64*4-4               ; advance UV pointer to start of next row
        SUB     r0,r0,#2*64*4-8             ; ditto Y pointer, but back two blocks
        ADD     r3,r1,#16                   ; reset column limit pointer (inner loop) - outptr+16 pixels (bytes)

        ; zero cumulative error on Y U V registers
        MOV     r7,#0
        MOV     r8,#0
        MOV     r9,#0

        ; Set r10 to pointer to the NEXT row of Y values, or equal to r0 if there isn't
        ; one or we're at the half-way point (where the next row is hard to find).
        ; (r10 is only read by CMP instructions whose results are not acted upon
        ; while the diffusion code above is commented out.)
        SUB     r10,r0,#2*64*4-8            ; set to NEXT y row pointer

        ; Check for having to change to the second pair of Y blocks, or terminate
        LDMIA   sp,{r4,r6}                  ; reload loop end test registers
        CMP     r10,r4                      ; check NEXT row pointer
        MOVEQ   r10,r0                      ; if a tricky case, discard it (>>> could do better here, and get half-way point right)
        CMP     r1,r4
        BNE     jc_colour_loop8             ; normal case - we set off on another two rows of output

        ; It's either the half-way point, in which case we need to change to the second pair
        ; of input Y blocks, or it's the end. Test r4 against the 'real' limit pointer.

        CMP     r4,r6                       ; is this the end?
        LDMEQIA sp!,{r0-r12,pc}             ; if so, return - nothing more to do

        ; We've reached the half-way point.
        MOV     r4,r6                       ; next time we test r4 and r6, exit.
        STR     r4,[sp]                     ; remember for final termination
        ADD     r0,r0,#7*8*4+8*8*4          ; advance r0 from end of row 0 of block 0,
                                            ; to start of row 0 of block 2.
        SUB     r10,r0,#2*64*4-8            ; set to NEXT y row pointer
        B       jc_colour_loop8             ; and continue.

      ]

        END