diff --git a/Sources/jdhuff b/Sources/jdhuff index dd0605f131478705582e86913b550cb5ea8a45a5..97a76e52bd44349359905e1502f32c0e70411b57 100644 --- a/Sources/jdhuff +++ b/Sources/jdhuff @@ -56,7 +56,7 @@ h_l RN 2 ; used in huff_decode - same as temp3 MACRO HUFF_DECODE_SETUP $tbl ; set up the specific table pointers for ac or dc huff table - ; $tbl is a HUFF_TBL*. + ; $tbl is a JHUFF_TBL*. ADD h_maxcode,$tbl,#huff_tbl_maxcode ADD h_shortcut,$tbl,#huff_tbl_shortcut ADD h_huffval,$tbl,#huff_tbl_huffval @@ -221,16 +221,15 @@ huff_decode_loop$lab ; -------------------------------------------------------------------- EXPORT asm_huff_decode_blocks asm_huff_decode_blocks -;LOCAL void -;huff_decode_blocks (decompress_info_ptr cinfo, JBLOCK block, -; HUFF_TBL *dctbl, HUFF_TBL *actbl, -; QUANT_TBL_PTR quanttbl, int *last_dc_val, int nblocks) +;huff_decode_blocks (j_decompress_ptr cinfo, JBLOCK block, +; JHUFF_TBL *dctbl, JHUFF_TBL *actbl, +; JQUANT_TBL *quanttbl, JCOEF *last_dc_val, int nblocks) ; r0 = cinfo ; r1 = block pointer -; r2 = HUFF_TBL* dctbl -; r3 = HUFF_TBL* actbl -; [sp] = quanttbl -; [sp,#4] = int *last_dc_val +; r2 = JHUFF_TBL *dctbl +; r3 = JHUFF_TBL *actbl +; [sp,#0] = quanttbl +; [sp,#4] = JCOEF *last_dc_val ; [sp,#8] = int nblocks ; save registers @@ -283,7 +282,7 @@ huff_block_clear huff_anotherblock ; loop round to here nblocks times ; Set up huffman decoding for the DC component. - LDR h_temp,[sp,#2*4] ; HUFF_TBL* dctbl (DC table pointer) + LDR h_temp,[sp,#2*4] ; JHUFF_TBL* dctbl (DC table pointer) HUFF_DECODE_SETUP h_temp ; set maxcode,huffval,shortcut ; Handle the DC component @@ -310,7 +309,7 @@ huff_dc_0 ; That's the DC value done. ; Set up huffman decoding for the AC components. - LDR h_temp,[sp,#3*4] ; HUFF_TBL* actbl (AC table pointer) + LDR h_temp,[sp,#3*4] ; JHUFF_TBL* actbl (AC table pointer) HUFF_DECODE_SETUP h_temp ; set maxcode,huffval,shortcut ; The loop that does AC components, once round for each non-zero component. @@ -380,16 +379,15 @@ huff_zag_end ; -------------------------------------------------------------------- EXPORT asm_huff_skip_blocks asm_huff_skip_blocks -;LOCAL void -;huff_skip_blocks (decompress_info_ptr cinfo, JBLOCK block, -; HUFF_TBL *dctbl, HUFF_TBL *actbl, -; QUANT_TBL_PTR quanttbl, int *last_dc_val, int nblocks) +;huff_skip_blocks (j_decompress_ptr cinfo, JBLOCK block, +; JHUFF_TBL *dctbl, JHUFF_TBL *actbl, +; JQUANT_TBL *quanttbl, JCOEF *last_dc_val, int nblocks) ; r0 = cinfo ; r1 = block pointer (UNUSED) -; r2 = HUFF_TBL* dctbl -; r3 = HUFF_TBL* actbl -; [sp] = quanttbl (UNUSED) -; [sp,#4] = int *last_dc_val +; r2 = JHUFF_TBL *dctbl +; r3 = JHUFF_TBL *actbl +; [sp,#0] = quanttbl (UNUSED) +; [sp,#4] = JCOEF *last_dc_val ; [sp,#8] = int nblocks ; This routine is very similar to huff_decode_blocks, except that ; we do not actually output the block - we simply skip forward that far @@ -415,7 +413,7 @@ asm_huff_skip_blocks huff_skip_anotherblock ; loop round to here nblocks times ; Set up huffman decoding for the DC component. - LDR h_temp,[sp,#2*4] ; HUFF_TBL* dctbl (DC table pointer) + LDR h_temp,[sp,#2*4] ; JHUFF_TBL* dctbl (DC table pointer) HUFF_DECODE_SETUP h_temp ; set maxcode,huffval,shortcut ; Handle the DC component @@ -435,7 +433,7 @@ huff_skip_dc_0 ; That's the DC value done. ; Set up huffman decoding for the AC components. 
- LDR h_temp,[sp,#3*4] ; HUFF_TBL* actbl (AC table pointer) + LDR h_temp,[sp,#3*4] ; JHUFF_TBL* actbl (AC table pointer) HUFF_DECODE_SETUP h_temp ; set maxcode,huffval,shortcut ; The loop that does AC components, once round for each non-zero component. diff --git a/Sources/jrevdct b/Sources/jrevdct index 6e70826c1d3db460b8f50749155dbdac627276af..d6abfda4fbd2a8252d43f14403e157ab98c99278 100644 --- a/Sources/jrevdct +++ b/Sources/jrevdct @@ -266,7 +266,7 @@ $rc._odd_shortcut ;; ------------------------------------------------------------------------ ;; Test proc - procedure to do a 1-D DCT ;; ------------------------------------------------------------------------ -;; extern void dct_1d(decompress_info_ptr cinfo, int *data); +;; extern void dct_1d(j_decompress_ptr cinfo, int *data); ;asm_dct_1_d ; STMDB sp!,{r0-r12,lr} ; save state @@ -290,7 +290,7 @@ $rc._odd_shortcut ; r2=count EXPORT asm_j_rev_dct -asm_j_rev_dct ; extern void asm_j_rev_dct(decompress_info_ptr cinfo, DCTBLOCK data, int count); +asm_j_rev_dct ; extern void asm_j_rev_dct(j_decompress_ptr cinfo, JBLOCK data, int count); CMP r2,#0 ; if count=0, do nothing MOVLE pc,lr diff --git a/VersionASM b/VersionASM index cc446a944968d2d284c884d647c1094b5114fcc3..d3f0d9ebe2ebe8922ce1c5ede272f025d413a9aa 100644 --- a/VersionASM +++ b/VersionASM @@ -13,11 +13,11 @@ GBLS Module_ComponentPath Module_MajorVersion SETS "1.38" Module_Version SETA 138 -Module_MinorVersion SETS "" -Module_Date SETS "23 Dec 2010" -Module_ApplicationDate SETS "23-Dec-10" +Module_MinorVersion SETS "1.35.2.1" +Module_Date SETS "04 Jan 2011" +Module_ApplicationDate SETS "04-Jan-11" Module_ComponentName SETS "SprExtend" Module_ComponentPath SETS "mixed/RiscOS/Sources/Video/Render/SprExtend" -Module_FullVersion SETS "1.38" -Module_HelpVersion SETS "1.38 (23 Dec 2010)" +Module_FullVersion SETS "1.38 (1.35.2.1)" +Module_HelpVersion SETS "1.38 (04 Jan 2011) 1.35.2.1" END diff --git a/VersionNum b/VersionNum index 0aec99713561dc3470f676814000652a7cf3d31d..d6c88986ee5a47205996a142d1f5dbac7b5de6b4 100644 --- a/VersionNum +++ b/VersionNum @@ -5,19 +5,19 @@ * */ #define Module_MajorVersion_CMHG 1.38 -#define Module_MinorVersion_CMHG -#define Module_Date_CMHG 23 Dec 2010 +#define Module_MinorVersion_CMHG 1.35.2.1 +#define Module_Date_CMHG 04 Jan 2011 #define Module_MajorVersion "1.38" #define Module_Version 138 -#define Module_MinorVersion "" -#define Module_Date "23 Dec 2010" +#define Module_MinorVersion "1.35.2.1" +#define Module_Date "04 Jan 2011" -#define Module_ApplicationDate "23-Dec-10" +#define Module_ApplicationDate "04-Jan-11" #define Module_ComponentName "SprExtend" #define Module_ComponentPath "mixed/RiscOS/Sources/Video/Render/SprExtend" -#define Module_FullVersion "1.38" -#define Module_HelpVersion "1.38 (23 Dec 2010)" +#define Module_FullVersion "1.38 (1.35.2.1)" +#define Module_HelpVersion "1.38 (04 Jan 2011) 1.35.2.1" #define Module_LibraryVersionInfo "1:38" diff --git a/c/putscaled b/c/putscaled new file mode 100644 index 0000000000000000000000000000000000000000..154a13226826ad366d0a3a2910ba7a3ce05c9938 --- /dev/null +++ b/c/putscaled @@ -0,0 +1,3486 @@ +/* Copyright 2011 Castle Technology Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* c.PutScaled - the bitblit compiler for PutSpriteScaled/PlotMaskScaled */ + +#include <stdarg.h> +#include <string.h> +#include <stdio.h> +#include "swis.h" +#include "commondefs.h" +#include "jpeglib.h" +#include "putscaled.h" +#include "C:Global.h.Sprite" + +/************************************************************************** +* * +* Macros. * +* * +**************************************************************************/ + +#define SOURCE_32_BIT (wp->save_inlog2bpp == 5) +#define SOURCE_16_BIT (wp->save_inlog2bpp == 4) +#define SOURCED_16_BIT (wp->save_inlog2bpc == 4) /* like SOURCE_16_BIT but includes 16-bit double-pixels */ +#define SOURCE_MASK (ws->masked) +#define SOURCE_BPPMASK (ws->mask1bpp) +#define SOURCE_TABLE ((wp->ColourTTR != 0) || (wp->trns_palette != 0)) + +#define DPIXEL_INPUT (wp->save_inlog2bpp != wp->save_inlog2bpc) +#define DPIXEL_OUTPUT (wp->BPP != wp->BPC) + +#define PLOTMASK ((wp->spritecode & 255) == SpriteReason_PlotMaskScaled) + +#define DEST_32_BIT (wp->BPP == 32) +#define DEST_16_BIT (wp->BPP == 16) +#define DEST_1_BIT (wp->BPC == 1) +#define DESTD_16_BIT (wp->BPC == 16) /* like DEST_16_BIT but includes 16-bit double-pixels */ + +/************************************************************************** +* * +* Low-level debugging output. * +* * +**************************************************************************/ + +#ifdef DEBUG +#define tracef(args) do_sprintf(0, args) +#define assert(x, y) do_assert(__LINE__, x, y, NULL) +#define newline() tracef("\n"); +#define comment(ws,text) do_comment(text) +#define IFDEBUG(a) a +#include "tracing.c" +#else +#define tracef(args) /* Nothing */ +#define assert(x, y) {if (!(x)) exit_erl(y, __LINE__);} +#define newline() /* Nothing */ +#define comment(ws,text) /* Nothing */ +#define IFDEBUG(a) /* Nothing */ +#endif + +/************************************************************************** +* * +* JPEG handling. * +* * +**************************************************************************/ + +#ifdef ASMjpeg +#include "rojpeg.c" +#endif + +/************************************************************************** +* * +* C Workspace declarations. * +* * +**************************************************************************/ + +/* Code buffers */ +#define NBUFFERS 8 /* Number of code buffers */ +#define BUFSIZE 256 /* words per buffer */ +typedef struct +{ + int key_word; /* descriptor for this code, or -1 if empty */ + int xadd; /* precise scale factors compiled into this code */ + int xdiv; + int yadd; + int ydiv; + int outoffset; /* output row offset compiled into this code */ + int code[BUFSIZE]; /* the code itself */ +} code_buffer; +#define FOR_EACH_BUFFER(ptr) for (ptr = &ws->buffers[0]; ptr < &ws->buffers[NBUFFERS]; ptr++) + +/* Labels - there's one of these for each label in the source we generate. */ +typedef struct +{ + int *def; /* where the label is, or 0 if not yet defined. */ + int *ref; /* a reference to the label, to be filled in when it's defined. 
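+                  * Only one outstanding forward reference per label is
+                  * supported: branch() asserts that ref is clear before
+                  * recording a new one, and define_label() patches and
+                  * clears it when the label is finally defined.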
*/ +#ifdef DEBUG + char *name; /* textual name of the label - same as field name */ +#endif +} label; + +/* Each label must be added as a field to this structure. */ +typedef struct +{ + #define FIRST_LABEL loop_y_repeat + label loop_y_repeat; +#ifdef TESTDEBUG + label test1; + label test2; +#endif + label loop_x_enter; + label loop_x_repeat; + label loop_x_exit; + label l_masked; + label loop_put_pixel_repeat; + label loop_put_masked_repeat; + label y_loop; + label y_loop_enter; + label y_loop_exit; + label loop_delay; + + label x_evenstart; + label x_oddmask; + label x_aligned_loop; + label x_aligned_enter; + label x_alignmask1; + label x_alignmask2; + label x_misaligned; + label x_misaligned_loop; + label x_misaligned_enter; + label x_misalignmask1; + label x_misalignmask2; + label x_2atatime_exit; + label x_lastmask; + label loop_x_exit1; + label loop_x_exitskip; + label loop1; + label loop2; + label plot_loopa; + label plot_loop1; + label plot_loop1a; + label plot_loop1b; + label plot_loop1c; + label plot_loop2; + label plot_loop3; + label plot_loop4; + label plot_loop4a; + label plot_loop4b; + label plot_loop4c; + + label last; + #define LAST_LABEL last + /* If you add a label, add giving it a name in check_workspace */ +} labels_rec; +#define FOR_EACH_LABEL(ptr) for (ptr = &ws->labels.FIRST_LABEL; ptr <= &ws->labels.LAST_LABEL; ptr++) +#define L(name) (&(ws->labels.name)) + +/* Register names - one for each register name (the register numbers are allocated at compile time) */ +typedef struct +{ + int regno; /* the physical register number */ +#ifdef DEBUG + char *name; /* the name, for trace output */ +#endif +} regname; + +/* Each register name must be added as a field to this structure. */ +typedef struct +{ + #define FIRST_REGISTER r_pixel + regname r_pixel; + regname r_inptr; + regname r_inshift; + regname r_inword; + regname r_maskinptr; + regname r_maskinword; + regname r_maskinshift; + regname r_masko; + regname r_temp1; + regname r_temp2; + regname r_c1632; + regname r_oditheradd; + regname r_blockroutine; + regname r_ecfindex; + regname r_bgcolour; + regname r_fetchroutine; + regname r_outptr; + regname r_outword; + regname r_outshift; + regname r_table; + regname r_xsize; + regname r_xcount; + regname r_ysize; + regname r_ycount; + regname r_inoffset; + regname r_maskinoffset; + regname r_in_pixmask; /* only used by 2-at-a-time loop */ + + regname r1; + regname r2; + regname r3; + regname wp; + regname sp; + regname lr; + regname pc; + #define LAST_REGISTER pc +} regnames_rec; +#define FOR_EACH_REGISTER_NAME(ptr) for (ptr = &ws->regnames.FIRST_REGISTER; ptr <= &ws->regnames.LAST_REGISTER; ptr++) + +#if 0 +#define R(reg) rr(&ws->regnames.reg) +static int rr(regname *r) +{ + /* Makes code bigger, lots of string clashes - rats! */ + if (r->regno == -1) tracef("Register %s not defined\n" _ r->name); + assert(r->regno != -1, ERROR_FATAL); + return r->regno; +} +#else +#define R(reg) rr(ws->regnames.reg.regno) +static int rr(int r) +{ + /* Assert that the register is at least set */ + assert(r != -1, ERROR_FATAL); + return r; +} +#endif + +/* The structure containing all workspace - essentially our static variables. 
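+ * It follows the assembler workspace (asm_workspace) handed to the
+ * compiler; check_code and check_code2 bracket it, and check_workspace()
+ * initialises everything when the first is absent and asserts that the
+ * second is still intact.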
*/ +#define CHECK_CODE 123456789 +typedef struct +{ + /* Initialisation */ + int check_code; + + /* Code buffer management */ + int build_buffer; /* Buffer currently being built, or next to build */ + int *compile_base; + int *compile_ptr; /* where to put next instruction */ + int *compile_lim; + + /* Label control and allocation */ + labels_rec labels; /* each label, and where it is in the generated code */ + + /* Register control and allocation */ + regnames_rec regnames; /* physical assignment of each register name */ + int next_free_reg; /* allocator of physical registers, as they are needed. */ + BOOL leave_r12_alone; /* Leave assembler 'wp' in place during compiled code */ + + int gcol; /* GCOL action */ + BOOL masked; /* whether to use mask */ + BOOL mask1bpp; /* whether mask is 1bpp mask */ + + int odither; /* If 0, then there's no ordered dither. If non-0, number of bits - 1 being truncated by dither. */ +#if 0 + int odither_eorvalue; /* value for eor alternation along a line */ + int odither_shift; /* offset of two-bit dither value in r_oditheradd */ +#endif + + /* Assemble-time constants */ + int in_bpp; + int in_bpc; /* Same as bpp unless double-pixel, in which case double bpp */ + int in_pixmask; + int mask_bpp; + int mask_bpc; + int mask_pixmask; + int out_l2bpp; /* not provided in wp */ + int out_l2bpc; /* ditto */ + int out_pixmask; /* mask for one pixel */ + int out_dpixmask; + int out_ppw; /* pixels per word */ + int out_l2ppw; + BOOL cal_table_simple; /* If true, a simple table lookup is possible */ + + /* Space for compiled code, near the end so most field accesses have only a small offset. */ + code_buffer buffers[NBUFFERS]; + + /* Check for workspace overwritten */ + int check_code2; +} workspace; + +static void check_workspace(workspace *ws) +/* Basic validity checks, and initialise if this is the first time. 
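+ * On the first call the check words are planted, every code buffer is
+ * marked empty (key_word == -1) and, in DEBUG builds, textual names are
+ * attached to all labels and register names for trace output.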
*/ +{ + assert(ws != 0, ERROR_NO_MEMORY); + if (ws->check_code != CHECK_CODE) + { + code_buffer *p; + tracef("Initialising workspace.\n"); + ws->check_code = CHECK_CODE; + ws->check_code2 = CHECK_CODE; + ws->build_buffer = 0; + FOR_EACH_BUFFER(p) p->key_word = -1; + +#ifdef DEBUG + { + label *l; + + /* Set up textual names of all the labels */ + FOR_EACH_LABEL(l) l->name = 0; + #define LN(lname) ws->labels.lname.name = #lname; + LN(loop_y_repeat) +#ifdef TESTDEBUG + LN(test1) + LN(test2) +#endif + LN(loop_x_enter) + LN(loop_x_repeat) + LN(loop_x_exit) + LN(l_masked) + LN(loop_put_pixel_repeat) + LN(loop_put_masked_repeat) + LN(y_loop) + LN(y_loop_enter) + LN(y_loop_exit) + LN(loop_delay) + + LN(x_evenstart) + LN(x_oddmask) + LN(x_aligned_loop) + LN(x_aligned_enter) + LN(x_alignmask1) + LN(x_alignmask2) + LN(x_misaligned) + LN(x_misaligned_loop) + LN(x_misaligned_enter) + LN(x_misalignmask1) + LN(x_misalignmask2) + LN(x_2atatime_exit) + LN(x_lastmask) + LN(loop_x_exit1) + LN(loop_x_exitskip) + LN(loop1) + LN(loop2) + LN(plot_loopa) + LN(plot_loop1) + LN(plot_loop1a) + LN(plot_loop1b) + LN(plot_loop1c) + LN(plot_loop2) + LN(plot_loop3) + LN(plot_loop4) + LN(plot_loop4a) + LN(plot_loop4b) + LN(plot_loop4c) + + LN(last) + /* Check he's got them all */ + FOR_EACH_LABEL(l) assert(l->name != 0, ERROR_FATAL); + } + { + regname *r; + + FOR_EACH_REGISTER_NAME(r) r->name = 0; + #define RNN(rname) ws->regnames.rname.name = #rname; + RNN(r_pixel) + RNN(r_inptr) + RNN(r_inshift) + RNN(r_inword) + RNN(r_maskinptr) + RNN(r_maskinword) + RNN(r_maskinshift) + RNN(r_masko) + RNN(r_temp1) + RNN(r_temp2) + RNN(r_c1632) + RNN(r_oditheradd) + RNN(r_blockroutine) + RNN(r_ecfindex) + RNN(r_bgcolour) + RNN(r_fetchroutine) + RNN(r_outptr) + RNN(r_outword) + RNN(r_outshift) + RNN(r_table) + RNN(r_xsize) + RNN(r_xcount) + RNN(r_ysize) + RNN(r_ycount) + RNN(r_inoffset) + RNN(r_maskinoffset) + RNN(r_in_pixmask) + + RNN(r1) + RNN(r2) + RNN(r3) + RNN(wp) + RNN(sp) + RNN(lr) + RNN(pc) + FOR_EACH_REGISTER_NAME(r) assert(r->name != 0, ERROR_FATAL); + ws->leave_r12_alone = FALSE; + } +#endif + } + assert(ws->check_code2 == CHECK_CODE, ERROR_FATAL); +} + +#ifdef DEBUG +static void dump_asm_workspace(asm_workspace *wp) +{ + /* Oddly spaced out to allow it to be easily lined up with the structure definition */ + tracef("Assembler workspace at %x:\n" _ wp); + tracef("save_outoffset=%i %t32. byte offset between output rows - SUBTRACT for next row.\n" _ wp->save_outoffset); + tracef("save_inoffset=%i %t32. byte offset between input rows - SUBTRACT for next row.\n" _ wp->save_inoffset); + tracef("save_inptr=0x%x %t32. word address of input pixels.\n" _ wp->save_inptr); + tracef("save_outptr=0x%x %t32. address of word containing first output pixel.\n" _ wp->save_outptr); + tracef("save_ydiv=%i %t32. subtracter value for y scale.\n" _ wp->save_ydiv); + tracef("save_yadd=%i %t32. adder value for y scale.\n" _ wp->save_yadd); + tracef("save_ysize=%i %t32. number of output rows.\n" _ wp->save_ysize); + tracef("save_ycount=%i %t32. total of ymag/ydiv sum, for y scale factor\n" _ wp->save_ycount); + newline(); + + tracef("save_inshift=%i %t32. bit shift of first pixel.\n" _ wp->save_inshift); + + + tracef("save_xsize=%i %t32. number of output pixels per row.\n" _ wp->save_xsize); + tracef("save_xcount=%i %t32. total of xmag/xdiv sum, for x scale factor\n" _ wp->save_xcount); + tracef("save_ecfptr=0x%x %t32. ECF pointer - only useful if plotting the mask.\n" _ wp->save_ecfptr); + tracef("save_ecflimit=0x%x %t32. 
ECF limit - only useful if plotting the mask.\n" _ wp->save_ecflimit); + + tracef("save_xdiv=%i %t32. subtracter value for x scale.\n" _ wp->save_xdiv); + tracef("save_xadd=%i %t32. adder value for x scale\n" _ wp->save_xadd); + newline(); + tracef("save_masko=%i %t32. if not 1bpp mask then this is mask data offset from inptr. Otherwise...\n" _ wp->save_masko); + tracef("save_xcoord=%i %t32. pixel x coordinate of first output pixel.\n" _ wp->save_xcoord); + tracef("save_ycoord=%i %t32. pixel y coordinate of first output pixel.\n" _ wp->save_ycoord); + + + + + + tracef("save_xmag=%i %t32. adder value for x scale?\n" _ wp->save_xmag); + tracef("save_ymag=%i %t32. adder value for y scale?\n" _ wp->save_ymag); + newline(); + + tracef("save_inlog2bpp=%i %t32. log 2 bits per pixel of input.\n" _ wp->save_inlog2bpp); + tracef("save_inlog2bpc=%i %t32. log 2 bits per character of input (only different for double-pixels).\n" + _ wp->save_inlog2bpc); + tracef("save_mode=%i (>>27 = %i) %t32. mode number/pointer of sprite - 1bpp sprites have hi bits set.\n" _ wp->save_mode _ wp->save_mode >> 27); + newline(); + + tracef("save_maskinshift=%i %t32. initial bit shift within mask word.\n" _ wp->save_maskinshift); + tracef("save_maskinptr=0x%x %t32. word address of mask (or 0 if there isn't one).\n" _ wp->save_maskinptr); + tracef("save_maskinoffset=%i %t32. byte offset between mask rows - SUBTRACT for next row.\n" _ wp->save_maskinoffset); + newline(); + + tracef("BPP=%i %t32. bits per pixel of output.\n" _ wp->BPP); + tracef("BPC=%i %t32. bits per character of output (only different for double pixels).\n" _ wp->BPC); + tracef("ColourTTR=0x%x %t32. translation table or palette.\n" _ wp->ColourTTR); + tracef("trns_palette=0x%x %t32. if non-0 ignore TTR and use this palette instead.\n" _ wp->trns_palette); + tracef("spritecode=%i (& 255 = %i) %t32. SpriteOp - 52 for PutSpriteScaled, 50 for PlotMaskScaled.\n" _ wp->spritecode _ wp->spritecode & 255); + tracef("bgcolour=%i %t32. Background colour (only valid if plotting the mask)\n" _ wp->bgcolour); + newline(); +} +#endif + +#ifdef TESTDEBUG +static void dump_workspace(workspace *ws) +{ + code_buffer *p; + + tracef("Dumping workspace.\n"); + #define DUMPINT(field) tracef("%s = %i.\n" _ #field _ ws->field); + DUMPINT(build_buffer) + FOR_EACH_BUFFER(p) tracef("buffer->keyword = %i.\n" _ p->key_word); +} +#endif + +/************************************************************************** +* * +* Low-level instruction generation. * +* * +**************************************************************************/ + +/* Condition codes */ +#define EQ 0xf0000000 /* It's 0 really - frigged so that 0 can be 'always' - the usual case. 
*/ +#define NE 0x10000000 +#define CS 0x20000000 +#define CC 0x30000000 +#define MI 0x40000000 +#define PL 0x50000000 +#define VS 0x60000000 +#define VC 0x70000000 +#define HI 0x80000000 +#define LS 0x90000000 +#define GE 0xa0000000 +#define LT 0xb0000000 +#define GT 0xc0000000 +#define LE 0xd0000000 +#define AL 0xe0000000 +#define NV 0xDONOTUSE + +/* Branches */ +#define B 0x0a000000 +#define BL 0x0b000000 +#define B_OFFSET_MASK 0x00ffffff /* and with this for negative offsets */ + +/* ALU ops */ +#define S (1<<20) +#define AND(dst,op1,rest,str) ins(ws,(0x0 << 21) | DSTR(dst) | OP1R(op1) | (rest), str) +#define EOR(dst,op1,rest,str) ins(ws,(0x1 << 21) | DSTR(dst) | OP1R(op1) | (rest), str) +#define SUB(dst,op1,rest,str) ins(ws,(0x2 << 21) | DSTR(dst) | OP1R(op1) | (rest), str) +#define RSB(dst,op1,rest,str) ins(ws,(0x3 << 21) | DSTR(dst) | OP1R(op1) | (rest), str) +#define ADD(dst,op1,rest,str) ins(ws,(0x4 << 21) | DSTR(dst) | OP1R(op1) | (rest), str) +#define ADC(dst,op1,rest,str) ins(ws,(0x5 << 21) | DSTR(dst) | OP1R(op1) | (rest), str) +#define SBC(dst,op1,rest,str) ins(ws,(0x6 << 21) | DSTR(dst) | OP1R(op1) | (rest), str) +#define RSC(dst,op1,rest,str) ins(ws,(0x7 << 21) | DSTR(dst) | OP1R(op1) | (rest), str) +#define TST(op1,rest,str) ins(ws,(0x8 << 21) | S | OP1R(op1) | (rest), str) +#define TEQ(op1,rest,str) ins(ws,(0x9 << 21) | S | OP1R(op1) | (rest), str) +#define CMP(op1,rest,str) ins(ws,(0xa << 21) | S | OP1R(op1) | (rest), str) +#define CMN(op1,rest,str) ins(ws,(0xb << 21) | S | OP1R(op1) | (rest), str) +#define ORR(dst,op1,rest,str) ins(ws,(0xc << 21) | DSTR(dst) | OP1R(op1) | (rest), str) +#define MOV(dst,rest,str) ins(ws,(0xd << 21) | DSTR(dst) | (rest), str) +#define BIC(dst,op1,rest,str) ins(ws,(0xe << 21) | DSTR(dst) | OP1R(op1) | (rest), str) +#define MVN(dst,rest,str) ins(ws,(0xf << 21) | DSTR(dst) | (rest), str) + +#define ADD_OPCODE (0x4 << 21) +#define SUB_OPCODE (0x2 << 21) +#define MOV_OPCODE (0xd << 21) + +#define DSTR(x) ((x) << 12) /* destination - ignored by TST/TEQ/CMP/CMN */ +#define OP1R(x) ((x) << 16) /* first operand */ +#define OP2R(x) ((x) << 0) /* if !IMM */ +#define IMM(x) ((x) | (1<<25)) /* an 8-bit unsigned field */ +#define IMMROR(x) ((x) << 7) /* an EVEN number to rotate right IMM by */ + +#define LSLI(x) (((x) << 7) | 0x00) /* 5-bit immed shift applied to OP2R */ +#define LSRI(x) (((x) << 7) | 0x20) +#define ASRI(x) (((x) << 7) | 0x40) +#define RORI(x) (((x) << 7) | 0x60) + +#define LSLR(x) (((x) << 8) | 0x10) /* shift register applied to OP2R */ +#define LSRR(x) (((x) << 8) | 0x30) +#define ASRR(x) (((x) << 8) | 0x50) +#define RORR(x) (((x) << 8) | 0x70) + +/* Load and store ops */ +#define LDR(reg,basereg) (0x04100000 | ((reg) << 12)| ((basereg) << 16)) +#define STR(reg,basereg) (0x04000000 | ((reg) << 12)| ((basereg) << 16)) +#define LDRB(reg,basereg) (0x04500000 | ((reg) << 12)| ((basereg) << 16)) +#define STRB(reg,basereg) (0x04400000 | ((reg) << 12)| ((basereg) << 16)) + +#define WRITEBACK (1 << 21) +#define ADDOFFSET (1 << 23) /* else subtract */ +#define PREADD (1 << 24) /* else post */ + +#define OFFSET(x) (PREADD | ADDOFFSET | (x)) /* normal simple index */ +#define NEGOFFSET(x) (PREADD | (x)) /* subtract offset */ +#define PREINC(x) (WRITEBACK | ADDOFFSET | PREADD | (x)) +#define PREDEC(x) (WRITEBACK | PREADD | (x)) +#define POSTINC(x) (ADDOFFSET | (x)) /* The manual says, do not set WRITEBACK if doing post-addition */ +#define POSTDEC(x) ((x)) /* writeback will always occur, setting it is does LDRT/LDRBT */ + +#define PUSH (0x08000000 
| (13<<16) /* register 13 */ \ + | (1<<21) /* write-back */ \ + | (1<<24) /* add offset before transfer */) +#define POP (0x08000000 | (13<<16) /* register 13 */ \ + | (1<<20) /* load from memory */ \ + | (1<<21) /* write-back */ \ + | (1<<23) /* add, not subtract */ ) + +#define LDMIA(reg) (0x08000000 | (reg<<16) /* register to load from */ \ + | (1<<20) /* load from memory */ \ + | (1<<23) /* add, not subtract */ ) + +#define STMIA(reg) (0x08000000 | (reg<<16) /* register to load from */ \ + | (1<<23) /* add, not subtract */ ) + +/* Supervisor call */ +#define SWI(swino) (0x0F000000 | swino) + +/* Indexed load - LSL shift assumed - writeback or negative not covered */ +#define INDEX(reg, shift) ((1<<25) | OFFSET(0) | OP2R(reg) | LSLI(shift)) + +/* Offset in assembler workspace */ +#define WP_OFFSET(field) OFFSET(((char*)&(wp->field)) - ((char*)&(wp->WP_FIRST_FIELD))) + +/* Define an assembler register */ +#define RN(name,no,describe) set_regname(ws, &ws->regnames.name, no, describe); + +#ifdef DEBUG +static void ldm_reg_list(workspace *ws, char *a, int regmask, BOOL lastname) +/* Construct a string in a which can be placed in curly brackets, describing + * a LDM/STM instruction. If lastname then find the last such register name in + * the case of duplicates - eg. the y-loop name rather than the x-loop name + * for the same physical register. + */ +{ + int i; + regname *r; + BOOL found; + char *aptr; + + a[0] = 0; + for (i = 0; i <= 15; i++) /* for each physical register */ + { + if ((regmask & (1<<i)) != 0) /* find a name for this register */ + { + found = FALSE; + aptr = a; + while (*aptr != 0) aptr++; /* points at the null at the end of the string */ + FOR_EACH_REGISTER_NAME(r) + { + if (r->regno == i) + { + *aptr = 0; /* If lastname and finding it again, delete last one */ + if (a[0] != 0) strcat(aptr, ","); + strcat(aptr, r->name); + found = TRUE; + if (!lastname) break; + } + } + assert(found, ERROR_FATAL); + } + } +} +#endif + +#ifdef DEBUG +static void ins(workspace *ws, int w, char *description) +#else +#define ins(ws,w,description) do_ins(ws,w) +static void do_ins(workspace *ws, int w) +#endif +/* Put an instruction into the output buffer. + * When debugging an assembler listings is generated too. These can be fed through + * objasm, and the results compared with the opcodes that I generate. + * Columns of assembler output: + * addressX opcodeXX label opcodes regs comment + * ^0 ^10 ^20 ^28 ^36 ^68 + */ +{ + int ccode = w & 0xf0000000; + + /* Handle the AL/EQ condition codes being wrong, so that 0 can be AL elsewhere. */ + if (ccode == 0xf0000000) w = w & 0x0fffffff; /* EQ code */ + else if (ccode == 0) w = w | 0xe0000000; /* AL code */ + /* All others are per the ARM expects */ + tracef("%x %x %t28.%s\n" _ + (ws->compile_ptr - ws->compile_base) * sizeof(int) _ + w _ description); /* pseudo-assembler format of output */ + + assert(ws->compile_ptr < ws->compile_lim, ERROR_NO_MEMORY); /* Check the buffer is big enough */ + *(ws->compile_ptr)++ = w; /* Store at then increment P% */ +} + +#ifdef DEBUG +#define DEFINE_LABEL(lab,describe) define_label(ws, L(lab), describe); +static void define_label(workspace *ws, label *lab, char *description) +#else +#define DEFINE_LABEL(lab,describe) define_label(ws, L(lab)); +static void define_label(workspace *ws, label *lab) +#endif +/* Define a label, and fill in a forward reference to it if necessary. 
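+ * A forward reference was emitted as a branch with a zero offset field;
+ * here it is patched with the word offset from (ref + 2), since the ARM
+ * pipeline makes branches relative to two words ahead, masked into the
+ * 24-bit offset field.  An illustrative use of the pair:
+ *   branch(ws, B | EQ, L(loop_x_exit), "BEQ loop_x_exit");   (records the ref)
+ *   ...
+ *   DEFINE_LABEL(loop_x_exit, "end of x loop")               (patches it here)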
+ */
+{
+  assert(lab->def == 0, ERROR_FATAL);           /* Check not defined twice */
+  lab->def = ws->compile_ptr;
+  tracef("%t20.%s%t68.; %s\n" _ lab->name _ description);
+  if (lab->ref != 0)
+  {
+    int newvalue = *(lab->ref) | (B_OFFSET_MASK & (lab->def - (lab->ref + 2))); /* compute offset */
+    tracef("%t20.; Zapping forward ref instruction at %x to be %x.\n" _
+           sizeof(int) * (lab->ref - ws->compile_base) _ newvalue);
+    *(lab->ref) = newvalue;
+    lab->ref = 0;
+  }
+}
+
+#ifdef DEBUG
+static void branch(workspace *ws, unsigned int opcode, label *lab, char *description)
+#else
+#define branch(ws,opcode,lab,description) do_branch(ws,opcode,lab)
+static void do_branch(workspace *ws, unsigned int opcode, label *lab)
+#endif
+/* Compile a branch instruction to a label. The opcode includes the condition code. */
+{
+  if (lab->def == 0)                            /* Forward reference */
+  {
+#ifdef DEBUG
+    if (lab->ref != 0)
+      tracef("Already referenced at 0x%x\n" _ sizeof(int) * (lab->ref - ws->compile_base));
+#endif
+    assert(lab->ref == 0, ERROR_FATAL);         /* Check for two forward refs to same label */
+    lab->ref = ws->compile_ptr;
+    ins(ws, opcode, description);               /* Just give as offset 0 for now */
+  }
+  else
+  {
+    assert(lab->ref == 0, ERROR_FATAL);
+    ins(ws,
+        opcode | (B_OFFSET_MASK & (lab->def - (ws->compile_ptr + 2))), description);
+  }
+}
+
+#ifdef DEBUG
+static void set_regname(workspace *ws, regname *r, int regno, char *describe)
+#else
+#define set_regname(ws,r,regno,describe) do_set_regname(ws,r,regno)
+static void do_set_regname(workspace *ws, regname *r, int regno)
+#endif
+/* Allocate a physical register number. If regno is -1 then allocate an
+ * as-yet-unused one, otherwise it's a specific register number.
+ */
+{
+  if (regno == -1)                              /* allocate a number, one of 0..12 */
+  {
+    regno = ws->next_free_reg;
+    ws->next_free_reg++;
+    assert(regno >= 0 && regno <= 12, ERROR_FATAL); /* Check for register overflow */
+    if (regno == 12) assert(!ws->leave_r12_alone, ERROR_FATAL);
+  }
+  r->regno = regno;
+  tracef("%t20.%s%t27 RN %t36.%i %t68.; %s\n" _ r->name _ r->regno _ describe);
+}
+
+static void align16(asm_workspace *wp, workspace *ws)
+/* Align next instruction to quadword boundary */
+{
+  UNUSED(wp);
+  while ((((int) ws->compile_ptr) & 15) != 0)
+    MOV(R(r_pixel), OP2R(R(r_pixel)), "MOV r_pixel,r_pixel ; align to 16-byte boundary");
+}
+
+#if defined(DEBUG_TML) && defined(DEBUG)
+static void write_reg(workspace *ws, regname *reg)
+/* Spool the register to the TML hardware */
+{
+  comment(ws, "Write Register to TML card");
+  tracef("Register to be output is... 
%s\n" _ reg->name); + ins(ws, PUSH | (1<<10) | (1<<11) | 1 | (1<<1) | (1<<14), "STMDB sp!,{r0,r1,r10,r11,r14} ; prepare to call SWI"); + ins(ws, MOV_OPCODE | DSTR(1) | OP2R(reg->regno), "MOV r1,r_somereg"); + ins(ws, MOV_OPCODE | DSTR(0) | OP2R(1), "MOV r0,r1"); + AND(0, 0, IMM(0xff), "AND r0,r0,#255 "); + ins(ws, SWI(HostFS_WriteC), "SWI HostFS_WriteC ; convert r1 value"); + ins(ws, MOV_OPCODE | DSTR(0) | OP2R(1) | LSRI(8), "MOV r0,r1 LSR #8"); + AND(0, 0, IMM(0xff), "AND r0,r0,#255 "); + ins(ws, SWI(HostFS_WriteC), "SWI HostFS_WriteC ; convert r1 value"); + ins(ws, MOV_OPCODE | DSTR(0) | OP2R(1) | LSRI(16), "MOV r0,r1 LSR #16"); + AND(0, 0, IMM(0xff), "AND r0,r0,#255 "); + ins(ws, SWI(HostFS_WriteC), "SWI HostFS_WriteC ; convert r1 value"); + ins(ws, MOV_OPCODE | DSTR(0) | OP2R(1) | LSRI(24), "MOV r0,r1 LSR #24"); + AND(0, 0, IMM(0xff), "AND r0,r0,#255 "); + ins(ws, SWI(HostFS_WriteC), "SWI HostFS_WriteC ; convert r1 value"); + ins(ws, POP | (1<<10) | (1<<11) | 1 | (1<<1) | (1<<14), "LDMIA sp!,{r0,r1,r10,r11,r14} ; restore after calling SWI"); + comment(ws, ""); +} +#endif + +static void compile_buffer_init(asm_workspace *wp, workspace *ws) +/* We intend to compile some code. Pick a buffer to use, and set up + * for generating into it. We use a simple round-robin for reusing buffers, + * rather than attempting to do LRU. + */ +{ + label *p; + regname *r; + code_buffer *b = &(ws->buffers[ws->build_buffer]); + ws->compile_base = &(b->code[0]); + ws->compile_ptr = ws->compile_base; + ws->compile_lim = ws->compile_base + BUFSIZE; + FOR_EACH_LABEL(p) {p->def = 0; p->ref = 0;} /* zap all the labels to be undefined. */ + FOR_EACH_REGISTER_NAME(r) r->regno = -1; + ws->next_free_reg = 0; /* allocate registers from 0 */ + tracef("Compile buffer initialised.\n"); + tracef("%t20; Blitting code for %s, scale factors %i:%i,%i:%i outoffset %x\n" _ + (PLOTMASK ? "PlotMaskScaled" : "PutSpriteScaled") _ + b->xadd - b->xdiv _ b->xdiv _ b->yadd _ b->ydiv _ wp->save_outoffset); + tracef("%t20; gcol action=%i in-bpp=%i out-bpp=%i in-dpix=%s out-dpix=%s mask=%s 1bppmask=%s palette=%s table=%s\n" _ + ws->gcol _ (1<<wp->save_inlog2bpp) _ wp->BPP _ + whether(DPIXEL_INPUT) _ whether(DPIXEL_OUTPUT) _ + whether(SOURCE_MASK) _ whether(SOURCE_BPPMASK) _ + whether(wp->trns_palette != 0) _ whether(wp->ColourTTR != 0)); + tracef("%t20.; Generated by compiler of (%s %s)\n" _ __DATE__ _ __TIME__); + comment(ws, "Get register and workspace definitions, turn on listing"); + tracef("%t28.GET w.GenHdr\n"); + tracef("%t28.OPT 1\n"); + RN(r1, 1, "r1"); + RN(r2, 2, "r2"); + RN(r3, 3, "r3"); + RN(wp, 12, "workspace pointer") + RN(sp, 13, "stack pointer") + RN(lr, 14, "link register") + RN(pc, 15, "program counter") + ws->leave_r12_alone = FALSE; /* by default, compiled code does not have module workspace pointer */ + UNUSED(wp); +} + +static void compile_buffer_done(workspace *ws) +/* Finished compiling code sequence. */ +{ +#ifdef DEBUG + label *p; +#endif + + tracef("%t28.END\n"); + tracef("Compile buffer done, %i words generated.\n" _ ws->compile_ptr - ws->compile_base); + /* Increment pointer for next buffer to reuse. 
+ */
+  ws->build_buffer++;
+  if (ws->build_buffer >= NBUFFERS) ws->build_buffer = 0;
+#ifdef DEBUG
+  /* Check no unresolved references to labels */
+  FOR_EACH_LABEL(p)
+  {
+    IFDEBUG(if(p->ref != 0) tracef("Unresolved reference to label %s at %x\n" _ p->name _ sizeof(int) * (p->ref - ws->compile_base));)
+    assert(p->ref == 0, ERROR_FATAL);
+  }
+#endif
+  /* ws->compile_base can be used as the base of the resulting procedure. */
+}
+
+/**************************************************************************
+*                                                                         *
+*  Test main entry sequence, low level IO and code generation.            *
+*                                                                         *
+**************************************************************************/
+
+#ifdef TESTDEBUG
+blitter putscaled_compiler(asm_workspace *wp, workspace *ws, workspace *ws_end, int gcol)
+{
+  /* Test low-level output */
+  writes(0, "Hello there!\nhi!\n");
+  writech(0, 'X');
+  writehex(0, 0x5732abcd, 8);
+  writech(0, '_');
+  tracef("Test tracef, esc:%%, string:'%s', char:'%c', int:'%i', hex:'%x'.\n",
+         "hello", 'X', 1234567, 0x6789abcd);
+
+  /* Check that the assembler has an adequate opinion of our workspace needs. */
+  tracef("wp=%x ws=%x ws_end=%x.\n" _ wp _ ws _ ws_end);
+  tracef("Size of assembler workspace: %i.\n" _ ((char*)ws) - ((char*)wp));
+  tracef("Size of C workspace: %i.\n" _ ((char*)ws_end) - ((char*)ws));
+  assert(ws_end > ws, ERROR_NO_MEMORY);
+  check_workspace(ws);
+  dump_workspace(ws);
+
+  compile_buffer_init(wp, ws);
+
+  /* Check compilation of all instruction forms. The resulting trace output can then
+   * have its binary details stripped, be run through objasm, and the resulting listings
+   * compared to check the bit patterns generated.
+   */
+  ADD(5, 6, GT | OP2R(7), "ADDGT r5,r6,r7");
+  branch(ws, B, L(y_loop_exit), "B y_loop_exit");
+  ORR(2, 3, S | IMM(23), "ORRS r2,r3,#23");
+  DEFINE_LABEL(test1, "test label")
+  EOR(1, 2, EQ | IMM(255) | IMMROR(10), "EOREQ r1,r2,#(255:ROR:10)");
+  comment(ws, "This is a comment");
+  DEFINE_LABEL(test2, "test label")
+  branch(ws, B + NE, L(loop_y_repeat), "BNE loop_y_repeat");
+  branch(ws, BL + EQ, L(loop_y_repeat), "BLEQ loop_y_repeat");
+  CMP(8, OP2R(9) | LSLI(12), "CMP r8,r9,LSL #12");
+  CMP(8, OP2R(9) | ASRR(6), "CMP r8,r9,ASR r6");
+  MOV(3, OP2R(4) | RORI(1), "MOV r3,r4,ROR #1");
+  MOV(R(pc), OP2R(R(lr)), "MOV pc,lr");
+
+  ins(ws, LDR(8,3) | OFFSET(249), "LDR r8,[r3,#249]");
+  ins(ws, LDR(8,3) | OFFSET(0), "LDR r8,[r3]");
+  ins(ws, STR(1,2) | GT + NEGOFFSET(12), "STRGT r1,[r2,#-12]");
+  ins(ws, LDRB(1,2) | PREINC(4), "LDRB r1,[r2,#4]!");
+  ins(ws, STRB(6,7) | POSTINC(4), "STRB r6,[r7],#4");
+  ins(ws, LDRB(1,2) | PREDEC(4), "LDRB r1,[r2,#-4]!");
+  ins(ws, STRB(6,7) | POSTDEC(4), "STRB r6,[r7],#-4");
+  ins(ws, LDRB(8,3) | INDEX(4,0), "LDRB r8,[r3,r4]");
+  ins(ws, LDR(8,3) | INDEX(4,2), "LDR r8,[r3,r4,LSL #2]");
+
+  ins(ws, PUSH | GT | (1<<4) | (1<<5) | (1<<6), "STMGTDB sp!,{r4,r5,r6}");
+  ins(ws, POP | (1<<4) | (1<<5) | (1<<6), "LDMIA sp!,{r4,r5,r6}");
+  ins(ws, POP | (1<<4) | (1<<5) | (1<<6) | (1<<R(pc)),"LDMIA sp!,{r4,r5,r6,pc}");
+
+  {
+    char a[256];
+    char a2[256];
+    int regmask = (1<<13) | (1<<15);            /* pretty arbitrary silly one actually */
+
+    ldm_reg_list(ws, a, regmask, FALSE);
+    do_sprintf(a2, "LDMIA lr,{%s}", a);
+    ins(ws, LDMIA(lr) | regmask, a2);
+    do_sprintf(a2, "STMIA lr,{%s}", a);
+    ins(ws, STMIA(lr) | regmask, a2);
+  }
+
+  branch(ws, BL + EQ, &ws->labels.loop_y_repeat, "BLEQ loop_y_repeat");
+
+  compile_buffer_done(ws);
+
+  writes(0, "Exit.\n");
+}
+#else
+
+/**************************************************************************
+*                                                                         *
+* 
Bitblit: Evaluate conditions. * +* * +**************************************************************************/ + +static BOOL simple_x_scale(asm_workspace *wp, workspace *ws) +/* Return true if 1:1 along x */ +{ + return ( wp->save_xadd - wp->save_xdiv == wp->save_xdiv + && wp->save_xdiv <= wp->save_xcount + && !PLOTMASK + && ws->gcol == 0 + && !ws->odither /* CAN be done, but the code sequences get awfully big so let's cut it out for now. */ + ? TRUE : FALSE); + /* Without the second test we MIGHT have to omit the first pixel, which the 1:1 code doesn't allow for. */ + /* The 2-at-a-time loop doesn't allow for PLOTMASK - not important enough. */ + /* The 2-at-a-time loop doesn't allow for any gcol but 0 - not important enough. */ +} + +static BOOL x_block_move(asm_workspace *wp, workspace *ws) +/* Returns true if the inner loop is the simple movement of a block of bits */ +{ + return ( simple_x_scale(wp, ws) + && wp->BPC == (1<<wp->save_inlog2bpc) + && ws->gcol == 0 + && !SOURCE_MASK + && !SOURCE_TABLE + && wp->cal_table == 0 + ? TRUE : FALSE); +} + +static BOOL simple_y_scale(asm_workspace *wp, workspace *ws) +/* Return true if 1:1 along y */ +{ + UNUSED(ws); + return wp->save_yadd == wp->save_ydiv; +} + +static int palette_is_grey(int *palette, int entries) +/* Scan a palette looking how they increment to deduce if it's just greyscale */ +{ + int loop; + int entry; + int ascending = 1; + + for (loop=0;loop<entries;loop++) + { + entry = palette[loop]; + + if (((entry ^ (entry>>8)) & 0xffff00) != 0) + return 0; + if ((entry & 0xff00)>>8 != loop) + ascending = 0; + } + if (ascending) + return 2; + return 1; +} + +/************************************************************************** +* * +* Bitblit: Register allocation. * +* * +**************************************************************************/ + +static void ptrs_rn(asm_workspace *wp, workspace *ws) +/* Declare the pointer registers, which must be visible in both the x-loop and the y-loop */ +{ + /* r_pixel is always needed, and need not be saved between loops. + * So, we put it in r14 to remove the need for the register allocator + * to worry about r14. + */ + RN(r_pixel, 14, "fetched and translated pixel") + + /* In most cases there are not enough registers, and the control of + * the outer (y) loop requires swapping two 'banks' of registers. + * inptr, outptr (and maskinptr if it exists) are always registers + * r0, r1, r2, and they are visible when the y registers are swapped in. + */ + RN(r_inptr, -1, PLOTMASK ? "ECF pattern pointer" : "input word pointer") + RN(r_outptr, -1, "word pointer to output") + if (SOURCE_BPPMASK || PLOTMASK) RN(r_maskinptr, -1, "mask input word pointer") + + if (ws->odither) RN(r_oditheradd, -1, "ordered dither offset value") + /* The initial dither add value needs to be changed for every output line, + * so it helps to have r_oditheradd visible in the y loop + */ +} + +static void xloop_rn(asm_workspace *wp, workspace *ws) +/* Other variables for the x-loop */ +{ + int need_temps = 0; /* set to 1 or 2 if temp1 and temp2 are needed */ + if (x_block_move(wp, ws)) + { + /* X loop is very very simple, and communicates with machine-code block-shift routine. 
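+     * The block transfer routine has a fixed register interface: inptr,
+     * outptr, inshift, outshift and xsize must land in r0-r4, which the
+     * asserts below verify.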
*/ + RN(r_inshift, -1, "Number of (most sig) bits of first input word to transfer, in 1..32") + RN(r_outshift, -1, "Number of (most sig) bits of first output word to fill, in 1..32") + RN(r_xsize, -1, "Number of bits to transfer per row") + RN(r_blockroutine, -1, "Block transfer routine") + /* Those registers had better be the same ones as the assembler code is expecting! */ + assert(ws->regnames.r_inptr.regno == 0, ERROR_FATAL); + assert(ws->regnames.r_outptr.regno == 1, ERROR_FATAL); + assert(ws->regnames.r_inshift.regno == 2, ERROR_FATAL); + assert(ws->regnames.r_outshift.regno == 3, ERROR_FATAL); + assert(ws->regnames.r_xsize.regno == 4, ERROR_FATAL); + } + else + { + /* Normal case - declare whatever other registers are needed for fetching and translating pixels. */ + if (PLOTMASK) + RN(r_inword, -1, "ECF pattern input word") + else if (!SOURCE_32_BIT) /* if not 32-bit source */ + { + RN(r_inshift, -1, "bit shift of current pixel LSL #27") + RN(r_inword, -1, "current input word") + } + if (SOURCE_MASK) + { + RN(r_maskinword, -1, "current mask word") + if (SOURCE_BPPMASK || PLOTMASK) + RN(r_maskinshift, -1, "bit shift of current mask pixel") + else + RN(r_masko, -1, "offset of mask data from sprite data") + } + if ( need_temps == 0 + && (ws->gcol != 0) + && DEST_32_BIT /* use in save_pixel */ + ) + need_temps = 1; + + if (PLOTMASK) + { + RN(r_ecfindex, -1, "index into ECF pattern") + RN(r_bgcolour, -1, "background plotting colour") + } + else + { + if (SOURCE_TABLE || wp->cal_table) RN(r_table, -1, "translation table or palette") + { + /* Work out whether we need 16->32 or 32->16 transformations, with their temp registers + * So, mirror the structure of translate_pixel + */ + int pixl2bpp = wp->save_inlog2bpp; + + if ((wp->trns_palette != 0) && (wp->BPP != 16)) pixl2bpp = 5; + + if (pixl2bpp == 5 && wp->BPP != 32) need_temps = 2; + + if (pixl2bpp == 4 && wp->BPP == 32) + { + need_temps = 2; + RN(r_c1632, -1, "constant for 16->32 transformation") + } + } + + if ( need_temps == 0 + && (wp->save_xmag % wp->save_xdiv) == 0 + && (wp->save_xmag / wp->save_xdiv) > 4 /* used in optimised scale up */ + ) + need_temps = 1; + } + + /* Declare whatever registers needed for saving the new pixel + * into the current destination pixel. + */ + if (!DEST_32_BIT) + { + RN(r_outword, -1, "current output word") + RN(r_outshift, -1, "bit shift of current pixel in current output word LSL 27") + } + + if (wp->save_inlog2bpp <= 3 && simple_x_scale(wp, ws)) + /* going to use 2-at-a-time loop - if 16bpp or more, don't need this register. */ + RN(r_in_pixmask, -1, "pixel mask for 2-at-a-time loop") + + /* Declare whatever registers are needed for control of + * horizontal scaling. For some simple cases no scaling registers + * are needed. + */ + RN(r_xsize, -1, "number of output pixels per row") + if (!simple_x_scale(wp, ws)) /* not 1:1 scale */ + RN(r_xcount, -1, "total for x scale") + /* Adder and subractor values become constants in the code. */ + } + + /* The temporaries are shuffled to the end, so that if r12 (the assembler wp) is used then + * it does not get loaded before the y loop variables are initialised. + */ + if (need_temps >= 1) RN(r_temp1, -1, "temp1 for pixel transformation temporary values") + if (need_temps >= 2) RN(r_temp2, -1, "temp2 for pixel transformation temporary values") + + /* MAX POSSIBLE REQUIREMENT - 13, if vcount stuff not done. + * It may appear 15, but temp1 and temp2 are only needed if one of src/dst + * is 32bpp, in which case we save elsewhere. 
+ * >>> AH not so, they are also needed if a palette is used, in which case + * the source can be fewer bpp. Ooops. Can we ever overflow? Not sure. + */ +} + +static int yloop_rn_count(asm_workspace *wp, workspace *ws) +/* Say how many registers yloop_rn will declare */ +{ + int result = 2; /* r_ysize, r_inoffset */ + if (wp->save_yadd != wp->save_ydiv) result++; /* r_ycount */ + if (SOURCE_BPPMASK || PLOTMASK) result++; /* r_maskinoffset */ + if (wp->is_it_jpeg) result++; /* r_fetchroutine */ + return result; +} + +static void yloop_rn(asm_workspace *wp, workspace *ws) +/* Declare whatever registers are needed for control of + * the vertical loop. These registers are part of a separate 'bank' + * from those in the central loop. + */ +{ + RN(r_ysize, -1, "number of output rows"); + if (!simple_y_scale(wp, ws)) /* not 1:1 scale */ + RN(r_ycount, -1, "total for y scale") + + /* Adder and subractor values become constants in the code. */ + RN(r_inoffset, -1, "byte offset between input rows.") + if (SOURCE_BPPMASK || PLOTMASK) RN(r_maskinoffset, -1, "byte offset between mask rows.") + if (wp->is_it_jpeg) RN(r_fetchroutine, -1, "routine for getting row of decompressed JPEG data.") + + /* MAX POSSIBLE REQUIREMENT - 5 registers */ +} + +/************************************************************************** +* * +* Bitblit: Register initialisation. * +* * +**************************************************************************/ + +/* Loading a constant index from the workspace pointer */ +#define LDR_WP(reg,value) ins(ws, LDR(R(reg),R(wp)) + WP_OFFSET(value), \ + "LDR " #reg "," #value); + +#ifdef DEBUG + #define LDR_WP_C(reg,value, comment) \ + { \ + char a[256]; \ + do_sprintf(a, "LDR " #reg "," #value " %t40.; " comment); \ + ins(ws, LDR(R(reg),R(wp)) + WP_OFFSET(value), a); \ + } +#else + #define LDR_WP_C(reg,value, comment) ins(ws, LDR(R(reg),R(wp)) + WP_OFFSET(value), 0); +#endif + +/* Loading a constant index from a register */ +#ifdef DEBUG + #define LDR_INDEX(destreg,indexreg,offset,comment) \ + { \ + char a[256]; \ + do_sprintf(a, "LDR " #destreg ",[" #indexreg ", #%i] %t40.; " comment, offset); \ + ins(ws, LDR(R(destreg),R(indexreg)) | OFFSET(offset), a); \ + } +#else + #define LDR_INDEX(destreg,indexreg,offset,comment) ins(ws, LDR(R(destreg),R(indexreg)) | OFFSET(offset), 0); +#endif + +static void get_in_shift(asm_workspace *wp, workspace *ws) +/* Used within fetch_pixel_init, to load r_inshift. The complication is + * that if this is JPEG data then the save_inshift value was not calculated, + * because SpriteExtend assembler stuff thought this was 32bit data. This + * only matters if JPEG is being made to produce 8bpp or 16bpp data. 
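+ * For non-JPEG input this is simply 32 - save_inshift (bits of the first
+ * word still to transfer); for 8bpp or 16bpp output from JPEG it has to
+ * be reconstructed from the input x coordinate instead.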
+ */ +{ + if (wp->is_it_jpeg && wp->save_inlog2bpp != 5) + { + LDR_WP_C(r_inshift, in_x, "input x coord (JPEG input data)") + if (wp->save_inlog2bpp == 4) + { + AND(R(r_inshift), R(r_inshift), S | IMM(1), "ANDS r_inshift,r_inshift,#1 ; halfword offset (0 or 1)"); + MOV(R(r_inshift), EQ | IMM(2), "MOVEQ r_inshift,#2 ; halfword offset (1 or 2)"); + MOV(R(r_inshift), OP2R(R(r_inshift)) | LSLI(4), "MOV r_inshift,r_inshift,LSL #4 ; 16/32 bit offset"); + } + else /* wp->save_inlog2bpp == 3 */ + { + AND(R(r_inshift), R(r_inshift), S | IMM(3), "ANDS r_inshift,r_inshift,#3 ; byte offset as 0/1/2/3"); + RSB(R(r_inshift), R(r_inshift), IMM(4), "RSB r_inshift,r_inshift,#4 ; byte offset as 4/3/2/1"); + MOV(R(r_inshift), OP2R(R(r_inshift)) | LSLI(3), "MOV r_inshift,r_inshift,LSL #3 ; 8/16/24/32 bit offset"); + } + } + else + { + LDR_WP_C(r_inshift, save_inshift, "input initial shift") + RSB(R(r_inshift), R(r_inshift), IMM(32), "RSB r_inshift,r_inshift,#32 ; pixels of first word to transfer, in 1..32"); + } +} + +static void fetch_pixel_init(asm_workspace *wp, workspace *ws) +/* Initialise whatever registers are needed for fetching and translating + * pixels. + */ +{ + /* The input word pointer */ + if (PLOTMASK) + { + LDR_WP_C(r_inptr, save_ecflimit, "base of ECF pattern") + } + else if (wp->is_it_jpeg) + { + LDR_WP_C(r_inptr, in_y, "initial y coordinate (for JPEG data)") + } + else /* normal data source for PutSpriteScaled */ + { + LDR_WP_C(r_inptr, save_inptr, "input word pointer") + } + + /* all other registers re fetching input data */ + if (x_block_move(wp, ws)) + { + /* Prepare for machine code core to inner loop */ +#if 0 + LDR_WP_C(r_inshift, save_inshift, "input initial shift (for block move)") + RSB(R(r_inshift), R(r_inshift), IMM(32), "RSB r_inshift,r_inshift,#32 ; pixels of first word to transfer, in 1..32"); +#else + get_in_shift(wp, ws); +#endif + LDR_WP(r_blockroutine, ccompiler_bitblockmove) + } + else + { + /* initialise r_inptr */ + if (PLOTMASK) + { + LDR_WP(r_ecfindex, save_ecfptr) /* byte index into ECF pattern, not rounded */ + AND(R(r_pixel), R(r_ecfindex), IMM(0x18), "AND r_pixel,r_ecfindex,#&18 ; extract initial row offset in ECF"); + ADD(R(r_inptr), R(r_inptr), OP2R(R(r_pixel)), "ADD r_inptr,r_inptr,r_pixel ; and add to initial ECF row address"); + LDR_WP(r_bgcolour, bgcolour) /* background colour pixel */ + } + else + { + /* r_inword and r_inshift */ + if (!SOURCE_32_BIT) /* if not 32-bit source */ + { + /* r_inword not initialised yet, done in inner loop */ +#if 0 + LDR_WP(r_inshift, save_inshift) + RSB(R(r_inshift), R(r_inshift), IMM(32), "RSB r_inshift,r_inshift,#32 ; pixels still to shift"); +#else + get_in_shift(wp, ws); +#endif + MOV(R(r_inshift), OP2R(R(r_inshift)) | LSLI(27), "MOV r_inshift,r_inshift,LSL #27 ; keep up at top end of register"); + } + } + + /* mask registers */ + if (SOURCE_MASK) + { + if (SOURCE_BPPMASK || PLOTMASK) + { + LDR_WP(r_maskinshift, save_maskinshift) + if (SOURCE_BPPMASK) + { + LDR_WP(r_maskinptr, save_maskinptr) + } + else /* PLOTMASK and not BPPMASK */ + { + LDR_WP_C(r_maskinptr, save_inptr, "mask pointer for PlotMaskScaled") + LDR_WP(r_pixel, save_masko) /* temp use of r_pixel */ + ADD(R(r_maskinptr), R(r_maskinptr), OP2R(R(r_pixel)),"ADD r_maskinptr,r_maskinptr,r_pixel ; mask pointer (for PlotMask)"); + } + RSB(R(r_maskinshift), R(r_maskinshift), IMM(32), "RSB r_maskinshift,r_maskinshift,#32 ; pixels still to shift"); + MOV(R(r_maskinshift), + OP2R(R(r_maskinshift)) | LSLI(27), "MOV r_maskinshift,r_maskinshift,LSL #27 ; keep up at top end 
of register"); + } + else + LDR_WP(r_masko, save_masko) + } + + /* translation registers */ + if (wp->cal_table) LDR_WP(r_table, cal_table) + else if (wp->trns_palette != 0) LDR_WP(r_table, trns_palette) + else + { + if (wp->ColourTTR != 0) + { + LDR_WP(r_table, ColourTTR) + if ( wp->BPP <= 8 /* 256 colours or less on output */ + && wp->save_inlog2bpp >= 4 /* thousands or millions of input colours */ + ) + { + ins(ws, LDR(R(r_table), R(r_table)) | OFFSET(4), "LDR r_table,[r_table,#4] ; load base of 32K table"); + } + } + } + + if (wp->save_inlog2bpp <= 3 && simple_x_scale(wp, ws)) + MOV(R(r_in_pixmask), IMM(ws->in_pixmask), "MOV r_in_pixmask,#in_pixmask ; for use in 2-at-a-time loop"); + + /* temp1 and temp2 need no initialisation. */ + if (ws->regnames.r_c1632.regno != -1) /* Generate binary constant 0000000011100000 1110000011100000 */ + { + MOV(R(r_c1632), IMM(0xe0), "MOV r_c1632,#&e0 ; 0000000000000000 0000000011100000"); + ORR(R(r_c1632), R(r_c1632),OP2R(R(r_c1632))|LSLI(8),"ORR r_c1632,r_c1632,r_c1632,LSL #8 ; 0000000000000000 1110000011100000"); + ORR(R(r_c1632), R(r_c1632),OP2R(R(r_c1632))|LSLI(8),"ORR r_c1632,r_c1632,r_c1632,LSL #8 ; 0000000011100000 1110000011100000"); + } + + if (ws->odither) + { + /* We use ordered dither to attempt to increase the output resolution by almost two bits. + * This only happens for a 16bpp or 32bpp source that's being truncated somewhat. + * A square of output pixels has the following binary addition values: + * 11 01 + * 00 10 + * These values are added to the value of each or R/G/B, just before those values are + * truncated or looked up in a table, shifted so that we add to the bits which are + * just about to be discarded. + * We keep the value to add in r_oditheradd. + * To proceed along the x axis we EOR by 10 every output pixel. + * We must also EOR by 01 every line. + * The starting value must be aligned with the origin of the output. + */ + comment(ws, "Compute initial dither addition value - bit 0 changes every y, bit 1 every x"); + LDR_WP(r_pixel, save_xcoord) + AND(R(r_pixel), R(r_pixel), IMM(1), "AND r_pixel,r_pixel,#1 ; least sig bit of x, for dither"); + LDR_WP(r_oditheradd, save_ycoord) + AND(R(r_oditheradd), R(r_oditheradd), IMM(1), "AND r_oditheradd,r_oditheradd,#1 ; least sig bit of y, for dither"); + EOR(R(r_pixel),R(r_pixel),OP2R(R(r_oditheradd)), "EOR r_pixel,r_pixel,r_oditheradd ; if we start Y off on an odd footing, invert x as well"); + ORR(R(r_oditheradd), R(r_oditheradd), + OP2R(R(r_pixel)) | LSLI(1), "ORR r_oditheradd,r_oditheradd,r_pixel,LSL #1 ; dither add value"); + + /* The dither should start based on the current ECF offset */ + MOV(R(r_pixel),IMM(0x10) | IMMROR(24), "MOV r_pixel,#&1000 ; prepare to get ECFYOffset"); + LDR_INDEX(r_pixel,r_pixel,0x1FC,"get kernel variable ECFYOffset from &11FC") + TST(R(r_pixel),IMM(1), "TST r_pixel,#1 ; is Y ECF offset odd?"); + EOR(R(r_oditheradd),R(r_oditheradd),NE | IMM(3), "EORNE r_oditheradd,r_oditheradd,#3 ; if so, change ordered dither origin to match"); + MOV(R(r_pixel),IMM(0x10) | IMMROR(24), "MOV r_pixel,#&1000 ; prepare to get ECFShift"); + LDR_INDEX(r_pixel,r_pixel,0x1F8,"get kernel variable ECFShift from &11F8") + TST(R(r_pixel),IMM(wp->BPP), "TST r_pixel,#out_bpp ; is ECF Shift an odd number of pixels?"); + EOR(R(r_oditheradd),R(r_oditheradd),NE | IMM(2), "EORNE r_oditheradd,r_oditheradd,#2 ; if so, change ordered dither origin to match"); + + /* Shift the dither value to the top of the register. 
*/ + { + IFDEBUG(char a[256];) + IFDEBUG(do_sprintf(a, "MOV r_oditheradd,r_oditheradd,LSL #%i %t40; shift to top of word", 23 + ws->odither);) + MOV(R(r_oditheradd), OP2R(R(r_oditheradd)) | LSLI(23 + ws->odither), a); + } + } + } + newline(); +} + +static void save_pixel_init(asm_workspace *wp, workspace *ws) +/* Initialise whatever registers are needed for saving the new pixel + * into the current destination pixel. + */ +{ + LDR_WP(r_outptr, save_outptr) + + if (x_block_move(wp, ws)) + { + /* Very simple inner loop */ + LDR_WP_C(r_pixel, save_xcoord, "get initial output x coord in pixels") /* Measured in pixels */ + AND(R(r_outshift), R(r_pixel), IMM(ws->out_ppw-1), "AND r_outshift,r_pixel,#out_ppw-1 ; pix offset of start"); + MOV(R(r_outshift),OP2R(R(r_outshift)) | LSLI(ws->out_l2bpc), "MOV r_outshift,r_outshift,LSL #out_l2bpc ; bit offset of start, in 0..31"); + RSB(R(r_outshift), R(r_outshift), IMM(32), "RSB r_outshift,r_outshift,#32 ; pixels of space, in 1..32"); + } + else + { + /* Normal cases */ + if (PLOTMASK || !DEST_32_BIT) + LDR_WP_C(r_pixel, save_xcoord, "output x coord measured in pixels") + + if (PLOTMASK) + { + MOV(R(r_ecfindex), OP2R(IMM(0)), "MOV r_ecfindex, #0 ; should always be 0 ?"); +#if 0 + AND(R(r_ecfindex), R(r_pixel), IMM(ws->out_ppw), "AND r_ecfindex,r_pixel,#out_ppw ; pixels into ECF pattern"); + /* Convert from pixels, to byte offset into ECF line - either 0 or 4 */ + if (ws->out_l2ppw > 2) /* > 4 output pixels per word */ + MOV(R(r_ecfindex), OP2R(R(r_ecfindex)) + | LSRI(ws->out_l2ppw - 2), "MOV r_ecfindex,r_ecfindex,LSR #out_l2ppw-2 ; convert to byte offset"); + if (ws->out_l2ppw < 2) /* < 4 output pixels per word (ie 2 or 1) */ + MOV(R(r_ecfindex), OP2R(R(r_ecfindex)) + | LSLI(2 - ws->out_l2ppw), "MOV r_ecfindex,r_ecfindex,LSL #2-out_l2ppw ; convert to byte offset"); +#endif + } + + if (!DEST_32_BIT) + { + AND(R(r_outshift), R(r_pixel), IMM(ws->out_ppw-1), "AND r_outshift,r_pixel,#out_ppw-1 ; pixel offset of start"); + MOV(R(r_outshift),OP2R(R(r_outshift)) | LSLI(ws->out_l2bpc),"MOV r_outshift,r_outshift,LSL #out_l2bpc ; bit offset of start"); + RSB(R(r_outshift), R(r_outshift), IMM(32), "RSB r_outshift,r_outshift,#32 ; pixels still to rotate"); + MOV(R(r_outshift), OP2R(R(r_outshift)) | LSLI(27), "MOV r_outshift,r_outshift,LSL #27 ; up at the top"); + } + } +} + +static void xloop_init(asm_workspace *wp, workspace *ws) +/* Initialise whatever registers are needed for control of + * horizontal scaling. For some simple cases no scaling registers + * are needed. + */ +{ + LDR_WP(r_xsize, save_xsize) + if (!simple_x_scale(wp, ws)) /* not 1:1 scale */ + { + if ((ws->odither) && (SOURCE_16_BIT)) + { + LDR_WP(r_pixel, save_xcount); /* Changed by (GPS) to fix register spill bug*/ + } + else + { + LDR_WP(r_xcount, save_xcount); + } + } + if (x_block_move(wp, ws)) + MOV(R(r_xsize), OP2R(R(r_xsize)) | LSLI(ws->out_l2bpc), "MOV r_xsize,r_xsize,LSL #out_l2bpc ; size in bits"); +} + +static void yloop_init(asm_workspace *wp, workspace *ws) +/* Initialise whatever registers are needed for control of + * the vertical loop. These registers are part of a separate 'bank' + * from those in the central loop. + */ +{ + if (wp->is_it_jpeg) LDR_WP_C(r_fetchroutine, fetchroutine, "routine to call to get JPEG data line") + LDR_WP(r_ysize, save_ysize) + if (!simple_y_scale(wp, ws)) /* not 1:1 scale */ LDR_WP(r_ycount, save_ycount) + if (!PLOTMASK) + { + if (wp->is_it_jpeg) + /* We could save this register, but there's not all that much point - simpler to code like this. 
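+       * For JPEG input r_inoffset is a row-coordinate step of 1 rather
+       * than a byte offset; each decompressed row is fetched on demand
+       * via r_fetchroutine.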
*/ + MOV(R(r_inoffset),IMM(1), "MOV r_inoffset,#1 ; JPEG coord offset on input"); + else + LDR_WP(r_inoffset, save_inoffset) + } + if (SOURCE_BPPMASK) LDR_WP(r_maskinoffset, save_maskinoffset) + else if (PLOTMASK) LDR_WP(r_maskinoffset, save_inoffset) +} + +/************************************************************************** +* * +* Bitblit: Pixel loading, translation, saving. * +* * +**************************************************************************/ +static void fetch_pixel_unmasked(asm_workspace *wp, workspace *ws) +/* Assuming no mask, get the next input pixel and put it in r_pixel. This is separated + * from fetch_pixel for the case of scaling up an ordered dither, where the same input + * pixel is repeatedly fetched and translated. + */ +{ + if (PLOTMASK) + { + comment(ws, "Fetch an ECF pixel"); + if (DEST_32_BIT) + { + ins(ws, LDR(R(r_inword), R(r_inptr)) + | INDEX(R(r_ecfindex), 0), "LDR r_inword,[r_inptr,r_ecfindex] 2222"); + ADD(R(r_ecfindex), R(r_ecfindex), + IMM(4), "ADD r_ecfindex,r_ecfindex,#4 5t453"); + ins(ws, LDR(R(r_bgcolour), R(r_inptr)) + | INDEX(R(r_ecfindex), 0), "LDR r_bgcolour,[r_inptr,r_ecfindex] ; load next EOR word of ECF222"); + SUB(R(r_ecfindex), R(r_ecfindex), + IMM(4), "SUB r_ecfindex,r_ecfindex,#4 1212"); + } + else + { + if (DEST_16_BIT) + { + MOV(R(r_pixel), OP2R(R(r_inword)) | LSLI(16), "MOV r_pixel,r_inword,LSL #16 ; fetch 16 bit ECF pattern pixel"); + MOV(R(r_pixel), OP2R(R(r_pixel)) | LSRI(16), "MOV r_pixel,r_pixel,LSR #16"); + } + else + { +// AND(R(r_pixel), R(r_inword), IMM(ws->out_pixmask),"AND r_pixel,r_inword,#out_pixmask ; fetch the pixel from the ECF pattern"); +// AND(R(r_pixel), R(r_pixel), OP2R(R(r_bgcolour)), "AND r_pixel,r_pixel,r_bgcolour ; turn it into a background colour pixel"); + } + } + } + else + { + comment(ws, "Fetch a source pixel"); + if (SOURCE_32_BIT) + ins(ws, LDR(R(r_pixel), R(r_inptr)) | OFFSET(0), "LDR r_pixel,[r_inptr]"); + else if (SOURCE_16_BIT) + { + MOV(R(r_pixel), OP2R(R(r_inword)) | LSLI(16), "MOV r_pixel,r_inword,LSL #16 ; fetch 16 bit pixel"); + MOV(R(r_pixel), OP2R(R(r_pixel)) | LSRI(16), "MOV r_pixel,r_pixel,LSR #16"); + /* >>> Maybe we can leave it in the top 16 bits, and get by? Not yet. */ + } + else + { + AND(R(r_pixel), R(r_inword), IMM(ws->in_pixmask), "AND r_pixel,r_inword,#in_pixmask ; fetch the pixel"); + } + } +} + +static BOOL fetch_pixel(asm_workspace *wp, workspace *ws, label *l_masked) +/* Check the mask, fetch the current pixel. If the current pixel is + * transparent then branch out to l_masked. Return TRUE if the branch could be + * taken, else FALSE. + */ +{ + IFDEBUG(char a[256];) + + if (SOURCE_MASK) + { + TST(R(r_maskinword), IMM(1), "TST r_maskinword,#1"); + IFDEBUG(do_sprintf(a, "BEQ %s", l_masked->name);) + branch(ws, B | EQ, l_masked, a); + } + + fetch_pixel_unmasked(wp, ws); + + return SOURCE_MASK; +} + +static BOOL fetch_pixel2(asm_workspace *wp, workspace *ws, label *l_masked) +/* Check the mask, fetch the pixel after the current one. You are assured + * that no word of input need be loaded between these two. If the pixel is + * transparent then branch out to l_masked. Return TRUE if the branch could be + * taken, else FALSE. 
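+ * For an 8bpp-or-less source both pixels come out of the same (already rotated) input
+ * word; as a purely illustrative C model, using the listing's symbolic names:
+ *     pix0 = inword & in_pixmask;   pix1 = (inword >> in_bpc) & in_pixmask;
+ * which is what the corresponding AND instructions in fetch_pixel and fetch_pixel2 implement.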
+ */ +{ +#ifdef DEBUG + char a[256]; +#endif + + assert(!PLOTMASK, ERROR_FATAL); /* Doesn't do 2-at-a-time loop */ + + if (SOURCE_MASK) /* Test the second pixel of mask */ + { + if (SOURCE_BPPMASK) /* we may have reached the end of mask word if not doing an aligned plot */ + { + MOV(R(r_maskinword), OP2R(R(r_maskinword)) + | RORI(ws->mask_bpp), "x"/*MOV r_maskinword,r_maskinword,ROR #mask_bpp"*/); + SUB(R(r_maskinshift),R(r_maskinshift), + S | IMM(ws->mask_bpp*2) | IMMROR(6), "x"/*SUBS r_maskinshift,r_maskinshift,#mask_bpp:SHL:27"*/); + ins(ws, LDR(R(r_maskinword), R(r_maskinptr)) + | EQ | WRITEBACK | OFFSET(4), "x"/* "LDREQ r_maskinword,[r_maskinptr,#4]! ; load more mask pixels (inc2)"*/); + TST(R(r_maskinword), IMM(1), "TST r_maskinword,#1"); + } + else + { + TST(R(r_maskinword), + ws->mask_bpc < 8 + ? IMM(1 << ws->mask_bpc) + : IMM(1) | IMMROR(32 - ws->mask_bpc), "TST r_maskinword,#1:SHL:mask_bpc"); + IFDEBUG(do_sprintf(a, "BEQ %s", l_masked->name);) + } + branch(ws, B | EQ, l_masked, a); + } + + comment(ws, "Fetch the source pixel after the current one"); + if (SOURCE_32_BIT) + ins(ws, LDR(R(r_pixel), R(r_inptr)) | OFFSET(4), "LDR r_pixel,[r_inptr,#4]"); + else if (SOURCE_16_BIT) + { + MOV(R(r_pixel), OP2R(R(r_inword)) | LSRI(16), "MOV r_pixel,r_inword,LSR #16"); + /* >>> Getting it into top 16bits harder in this case! */ + } + else + AND(R(r_pixel), R(r_in_pixmask), + OP2R(R(r_inword)) | LSRI(ws->in_bpc), "AND r_pixel,r_in_pixmask,r_inword,LSR #in_bpc" + " ; fetch the next pixel"); + return SOURCE_MASK; +} + +#ifdef DEBUG +static void add_ordered_dither_gun(asm_workspace *wp, workspace *ws, int bits_per_gun, int offset, char *gun) +#else +#define add_ordered_dither_gun(a,b,c,d,e) do_add_ordered_dither_gun(a,b,c,d) +static void do_add_ordered_dither_gun(asm_workspace *wp, workspace *ws, int bits_per_gun, int offset) +#endif +/* Do one gun of the ordered dither - entirely local to add_ordered_dither below + * Offset is the offset from bit 0 of the base of this field of the colour + */ +{ + int x = 32 - bits_per_gun - offset; /* amount to shift the colour field in question */ +#ifdef DEBUG + char a[256]; +#endif + + IFDEBUG(do_sprintf(a, "CMN r_oditheradd,r_pixel,LSL #%i %t40; will the %s value overflow?", x, gun);) + CMN(R(r_oditheradd), OP2R(R(r_pixel)) | LSLI(x), a); + + IFDEBUG(do_sprintf(a, "ADDCC r_pixel,r_pixel,r_oditheradd,LSR #%i %t40; if not, add.", x);) + ADD(R(r_pixel), R(r_pixel), CC | OP2R(R(r_oditheradd)) | LSRI(x), a); + UNUSED(wp); +} + +static void add_ordered_dither(asm_workspace *wp, workspace *ws, int bits_per_gun) +/* bits_per_gun is 5 or 8. The 32-bit RGB value in r_pixel should have + * r_oditheradd >> (32-bits_per_gun) added to each of R/G/B, except that these + * additions should be 'sticky' at 255 in each gun. + * + * The resulting values are just about to be truncated somewhat, so the lo + * bits of each answer do not matter much. Thus, if the value is currently + * 254 we never add, but this doesn't matter. + */ +{ + if (ws->odither) /* turn off for now */ + { + comment(ws, "Add current value for ordered dither"); + add_ordered_dither_gun(wp, ws, bits_per_gun, 2*bits_per_gun, "blue"); + add_ordered_dither_gun(wp, ws, bits_per_gun, 1*bits_per_gun, "green"); + add_ordered_dither_gun(wp, ws, bits_per_gun, 0, "red"); + newline(); + } +} + +static void translate_pixel(asm_workspace *wp, workspace *ws) +/* Translate r_pixel from being a source pixel, to being a destination pixel. 
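+ *
+ * The two pure format conversions this can emit (32bpp down to 16bpp, and 16bpp back up
+ * to 32bpp) behave roughly like the plain-C sketch below. The 0x00BBGGRR and
+ * 0bbbbbgggggrrrrr layouts are those shown in the listing comments; the function names
+ * are illustrative only and nothing here is itself emitted:
+ *
+ *     static unsigned munge32to16(unsigned p)      // 00BBGGRR -> 0bbbbbgggggrrrrr
+ *     {
+ *         return ((p >> 3) & 0x001F)               // top 5 bits of red   -> bits 0-4
+ *              | ((p >> 6) & 0x03E0)               // top 5 bits of green -> bits 5-9
+ *              | ((p >> 9) & 0x7C00);              // top 5 bits of blue  -> bits 10-14
+ *     }
+ *
+ *     static unsigned expand16to32(unsigned p)     // 0bbbbbgggggrrrrr -> 00BBGGRR
+ *     {
+ *         unsigned r = (p << 3) & 0xF8, g = (p >> 2) & 0xF8, b = (p >> 7) & 0xF8;
+ *         r |= r >> 5;  g |= g >> 5;  b |= b >> 5;  // copy top 3 bits into the bottom 3
+ *         return (b << 16) | (g << 8) | r;
+ *     }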
*/ +{ + int pixl2bpp = wp->save_inlog2bpp; + + if (PLOTMASK) + { + if ((ws->gcol & 7) == 2) /* AND plot action */ + { + MOV(R(r_pixel), OP2R(R(r_pixel)) | LSLI(31-(wp->BPP)), "MOV r_pixel, r_pixel, LSL 31-out_bpp ;a"); + ORR(R(r_pixel), R(r_pixel), IMM(2) | IMMROR(2), "ORR r_pixel,r_pixel,#&80000000 ;a"); + MOV(R(r_pixel), OP2R(R(r_pixel)) | ASRI(31-(wp->BPP)), "MOV r_pixel, r_pixel, ASR 31-out_bpp ;a"); + } + return; /* No more transformation necessary */ + } + + if (ws->odither) add_ordered_dither(wp, ws, pixl2bpp == 5 ? 8 : 5); /* do ordered dither */ + + comment(ws, "Perform any0 transformation necessary"); + if (wp->trns_palette != 0) + { + assert(pixl2bpp <= 3, ERROR_FATAL); + if (wp->BPP == 16) + { + ins(ws, LDR(R(r_pixel), R(r_table)) + | INDEX(R(r_pixel), 2), "LDR r_pixel,[r_table, r_pixel, LSL #2] ; 16bpp palette lookup"); + pixl2bpp = 4; + } + else + { + ins(ws, LDR(R(r_pixel), R(r_table)) + | INDEX(R(r_pixel), 3), "LDR r_pixel,[r_table, r_pixel, LSL #3] ; palette lookup"); + MOV(R(r_pixel), OP2R(R(r_pixel)) | LSRI(8), "MOV r_pixel,r_pixel,LSR #8 ; BBGGRR00 to 00BBGGRR "); + pixl2bpp = 5; + } + } + + if (pixl2bpp == 5 && wp->BPP != 32 && !(wp->is_it_jpeg && (wp->dither_truecolour & 2) && (wp->BPP != 16))) /* all we can do is truncate to 4, as a first stage. */ + { + /* if (wp->BPP == 16 && ws->odither) add_ordered_dither(wp, ws, 8); */ + comment(ws, " Taken from munge32to16 fedcba9876543210 fedcba9876543210"); + comment(ws, " r_pixel = 00000000bbbbbbbb ggggggggrrrrrrrr"); + AND(R(r_temp1),R(r_pixel),IMM(0xf8) | IMMROR(16), "AND r_temp1,r_pixel,#&F80000 ; r_temp1 = 00000000bbbbb000 0000000000000000"); + MOV(R(r_temp2), OP2R(R(r_temp1)) | LSLI(7), "MOV r_temp2,r_temp1,LSL #7 ; r_temp2 = 0bbbbb0000000000 0000000000000000"); + AND(R(r_temp1), R(r_pixel), IMM(0xf8)|IMMROR(24), "AND r_temp1,r_pixel,#&f800 ; r_temp1 = 0000000000000000 ggggg00000000000"); + ORR(R(r_temp2),R(r_temp2),OP2R(R(r_temp1))|LSLI(10),"ORR r_temp2,r_temp2,r_temp1,LSL #10 ; r_temp2 = 0bbbbbggggg00000"); + AND(R(r_temp1), R(r_pixel), IMM(0xf8), "AND r_temp1,r_pixel,#&F8 ; r_temp1 = 0000000000000000 00000000rrrrr000"); + ORR(R(r_pixel),R(r_temp2),OP2R(R(r_temp1))|LSLI(13),"ORR r_pixel,r_temp2,r_temp1,LSL #13 ; r_pixel = 0bbbbbgggggrrrrr"); + MOV(R(r_pixel),OP2R(R(r_pixel)) | LSRI(16), "MOV r_pixel,r_pixel,LSR #16 ; result in bottom half"); + /* >>> check re keeping 16bit r_pixel at the top */ + pixl2bpp = 4; + } + + if (pixl2bpp == 4 && wp->BPP == 32) /* pad out to 32bpp */ + { + MOV(R(r_pixel),OP2R(R(r_pixel)) | LSLI(16), "MOV r_pixel,r_pixel,LSL #16 ; input in top half"); + /* >>> check re keeping 16bit r_pixel at the top */ + comment(ws, " ; fedcba9876543210 fedcba9876543210"); + comment(ws, " ; r_pixel = 0bbbbbgggggrrrrr"); + MOV(R(r_temp1), OP2R(R(r_pixel)) | LSRI(26), "MOV r_temp1,r_pixel,LSR #26 ; r_temp1 = 0bbbbb"); + MOV(R(r_temp2), OP2R(R(r_temp1)) | LSLI(19), "MOV r_temp2,r_temp1,LSL #19 ; r_temp2 = 0bbbbb000 0000000000000000"); + AND(R(r_temp1), R(r_pixel), IMM(0x3E) | IMMROR(12), "AND r_temp1,r_pixel,#&03E00000 ; r_temp1 = 000000ggggg00000"); + ORR(R(r_temp2), R(r_temp2),OP2R(R(r_temp1))|LSRI(10),"ORR r_temp2,r_temp2,r_temp1,LSR #10 ; r_temp2 = 0bbbbb000 ggggg00000000000"); + MOV(R(r_temp1), OP2R(R(r_pixel)) | LSLI(11), "MOV r_temp1,r_pixel,LSL #11 ; r_temp1 = rrrrr00000000000 0000000000000000"); + ORR(R(r_temp2), R(r_temp2),OP2R(R(r_temp1))|LSRI(24),"ORR r_temp2,r_temp2,r_temp1,LSR #24 ; r_temp2 = 0bbbbb000 ggggg000rrrrr000"); + comment(ws, "Now copy the top three bits of each colour component into the 
bottom three"); + comment(ws, " ; r_c1632 = 0000000011100000 1110000011100000"); + AND(R(r_temp1), R(r_temp2), OP2R(R(r_c1632)), "AND r_temp1,r_temp2,r_c1632 ; r_temp1 = 00000000bbb00000 ggg00000rrr00000"); + ORR(R(r_pixel), R(r_temp2),OP2R(R(r_temp1))|LSRI(5), "ORR r_pixel,r_temp2,r_temp1,LSR #5 ; r_pixel = 00000000bbbbbbbb ggggggggrrrrrrrr"); + pixl2bpp = 5; + } + + /* Translation table lookup */ + if (wp->ColourTTR != 0) + { + comment(ws, "We have a translation table."); + if (ws->out_l2bpp <= 3) /* ie BPP <= 8 */ + { + assert(pixl2bpp <= 4, ERROR_FATAL); /* up to 32K entries in byte table */ + /* if (pixl2bpp == 4 && ws->odither) add_ordered_dither(wp, ws, 5); */ + ins(ws, LDRB(R(r_pixel), R(r_table)) | INDEX(R(r_pixel), 0), "LDRB r_pixel,[r_table, r_pixel] ; byte table lookup"); + } + else + { + assert(pixl2bpp <= 3, ERROR_FATAL); /* up to 256 entries in word table */ + ins(ws, LDR(R(r_pixel), R(r_table)) | INDEX(R(r_pixel), 2), "LDR r_pixel,[r_table, r_pixel, LSL #2] ; word table lookup"); + /* >>> with 16bpp that could be in the top half? Not sure... */ + } + pixl2bpp = ws->out_l2bpp; /* we've finished */ + } + else if (wp->is_it_jpeg && (wp->dither_truecolour & 2)) + { + /* bottom n bits of word contains colour number we want... */ + pixl2bpp = ws->out_l2bpp; /* we've finished */ + comment(ws, "JPEG error diffusion should have done all the work!"); + } + else if (pixl2bpp == 4 && ws->out_l2bpp < 4) + { + /* Hack for JPEG data in RISC OS 3 + * r_pixel is a 16bpp colour value at the moment, but we have no lookup table for the 16->1/2/4/8 transition + * For 1/2/4bpp we use the top bits of red as the grey level. From a JPEG source this will work + * fine, as the JPEG will have noticed that the output is mono and simply produced greyscale + * output. + */ + comment(ws, "Colour truncation without lookup table.\n"); + if (ws->out_l2bpp == 0) /* 1bpp */ + { + comment(ws, "Creating 0 or 1 from 0bbbbbgg gggrrrrr"); + TST(R(r_pixel), IMM(16), "TST r_pixel,#16 ; test hi bit of R"); + MOV(R(r_pixel), IMM(1), "MOV r_pixel,#1 ; black"); + MOV(R(r_pixel), NE | IMM(0), "MOVNE r_pixel,#0 ; white"); + pixl2bpp = 0; + } + else if (ws->out_l2bpp == 1) /* 2bpp */ + { + comment(ws, "Creating 0,1,2 or 3 from 0bbbbbgg gggrrrrr"); + MOV(R(r_pixel), OP2R(R(r_pixel)) | LSRI(3), "MOV r_pixel,r_pixel,LSR #3 ; hi 2 bits of R"); + AND(R(r_pixel), R(r_pixel), IMM(3), "AND r_pixel,r_pixel,#3 ; mask off everything else"); + RSB(R(r_pixel), R(r_pixel), IMM(3), "RSB r_pixel,r_pixel,#3 ; change to 0->white, 3->black"); + pixl2bpp = 1; + } + if (ws->out_l2bpp == 2) /* 4bpp */ + { + comment(ws, "Creating wimp colour in 0..7 from 0bbbbbgg gggrrrrr"); + MOV(R(r_pixel), OP2R(R(r_pixel)) | LSRI(2), "MOV r_pixel,r_pixel,LSR #2 ; hi 3 bits of R"); + AND(R(r_pixel), R(r_pixel), IMM(7), "AND r_pixel,r_pixel,#7 ; mask off everything else"); + RSB(R(r_pixel), R(r_pixel), IMM(7), "RSB r_pixel,r_pixel,#7 ; change to 0->white, 7->black"); + pixl2bpp = 2; + } + else if (ws->out_l2bpp == 3) /* 8bpp from 16bpp true colour, no lookup table - only for use on RISC OS 3.10 */ + { + /* Get the top two bits of each gun. The organisation is: + * bit 0 - tint 0 + * bit 1 - tint 1 + * bit 2 - red 2 + * bit 3 - blue 2 + * bit 4 - red 3 (high) + * bit 5 - green 2 + * bit 6 - green 3 (high) + * bit 7 - blue 3 (high) + */ + comment(ws, "Creating bggrbrtt from 0bbbbbgg gggrrrrr"); + + /* Making the tint - the average of the lo 3 bits of RGB isn't a bad approximation. We make this + * by adding them all up, multiplying by 3, and dividing by 8. 
We involve the lo bits in the approximation + * as well, in case they produce a useful carry. + */ + AND(R(r_temp1), R(r_pixel), IMM(0x1C) | IMMROR(24), "AND r_temp1,r_pixel,#&1C00 ; bottom 3 bits of B"); + MOV(R(r_temp2), OP2R(R(r_temp1)) | LSRI(10), "MOV r_temp2,r_temp1,LSR #10 ; at bottom of temp2"); + AND(R(r_temp1), R(r_pixel), IMM(0xE0), "AND r_temp1,r_pixel,#&E0 ; bottom 3 bits of G"); + ADD(R(r_temp2), R(r_temp2), OP2R(R(r_temp1)) | LSRI(5), "ADD r_temp2,r_temp2,r_temp1,LSR #5 ; add to bottom B bits"); + AND(R(r_temp1), R(r_pixel), IMM(0x07), "AND r_temp1,r_pixel,#&07 ; bottom 3 bits of R"); + ADD(R(r_temp2), R(r_temp2), OP2R(R(r_temp1)), "ADD r_temp2,r_temp2,r_temp1 ; add to bottom B+G bits"); + ADD(R(r_temp2), R(r_temp2), OP2R(R(r_temp2)) | LSLI(2), "ADD r_temp2,r_temp2,r_temp2,LSL #2 ; (lo R+G+B)*5 (< 128)"); + + /* The hi bits are just done by extracting from the 16bpp value. This takes ages! */ + MOV(R(r_temp1), IMM(0), "MOV r_temp1,#0 ; building result pixel for hi bits"); + + /* Top bits of B */ + TST(R(r_pixel), IMM(64) | IMMROR(24), "TST r_pixel,#&4000 ; test top bit of B"); + ORR(R(r_temp1), R(r_temp1), NE | IMM(128), "ORRNE r_temp1,r_temp1,#128 ; bit 7 = top bit of B"); + TST(R(r_pixel), IMM(32) | IMMROR(24), "TST r_pixel,#&2000 ; test next bit of B"); + ORR(R(r_temp1), R(r_temp1), NE | IMM(8), "ORRNE r_temp1,r_temp1,#8 ; bit 3 = next bit of B"); + + /* Top bits of G */ + TST(R(r_pixel), IMM(2) | IMMROR(24), "TST r_pixel,#&200 ; test top bit of G"); + ORR(R(r_temp1), R(r_temp1), NE | IMM(64), "ORRNE r_temp1,r_temp1,#64 ; bit 6 = top bit of G"); + TST(R(r_pixel), IMM(1) | IMMROR(24), "TST r_pixel,#&100 ; test next bit of G"); + ORR(R(r_temp1), R(r_temp1), NE | IMM(32), "ORRNE r_temp1,r_temp1,#32 ; bit 5 = next bit of G"); + + /* Top bits of R */ + TST(R(r_pixel), IMM(16), "TST r_pixel,#&10 ; test top bit of R"); + ORR(R(r_temp1), R(r_temp1), NE | IMM(16), "ORRNE r_temp1,r_temp1,#16 ; bit 4 = top bit of R"); + TST(R(r_pixel), IMM(8), "TST r_pixel,#&08 ; test next bit of R"); + ORR(R(r_temp1), R(r_temp1), NE | IMM(4), "ORRNE r_temp1,r_temp1,#4 ; bit 2 = next bit of R"); + + ORR(R(r_pixel), R(r_temp1), OP2R(R(r_temp2)) | LSRI(5), "ORR r_pixel,r_temp1,r_temp2,LSR #5 ; combine hi bits and tint"); + + pixl2bpp = 3; + } + } + + assert(pixl2bpp == ws->out_l2bpp, ERROR_FATAL); /* If this hasn't happened, we haven't completed the transformation. */ + + if (((ws->gcol & 7) == 2) && (pixl2bpp != 5)) /* AND plot action which did something stupid for 32bpp (GPS)*/ + { + MOV(R(r_pixel), OP2R(R(r_pixel)) | LSLI(31-(wp->BPP)), "MOV r_pixel, r_pixel, LSL 31-out_bpp"); + ORR(R(r_pixel), R(r_pixel), IMM(2) | IMMROR(2), "ORR r_pixel,r_pixel,#&80000000 "); + MOV(R(r_pixel), OP2R(R(r_pixel)) | ASRI(31-(wp->BPP)), "MOV r_pixel, r_pixel, ASR 31-out_bpp"); + } + + comment(ws, "r_pixel is now a destination pixel."); + + if (DPIXEL_OUTPUT) + ORR(R(r_pixel), R(r_pixel), OP2R(R(r_pixel)) | LSLI(wp->BPP), "ORR r_pixel,r_pixel,r_pixel,LSL #out_bpp:SHR:1 ; double pixel output"); + + newline(); +} + +static void save_pixel(asm_workspace *wp, workspace *ws) +/* Save the new pixel into the current destination pixel. 
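+ *
+ * As a rough, purely illustrative C model (variable names are for exposition, and this
+ * ignores the ECF and sub-word packing paths handled below), the GCOL actions listed in
+ * the next comment combine the old and new pixels like so:
+ *
+ *     switch (gcol & 7)
+ *     {
+ *         case 0: oldpix = newpix;       break;   // overwrite
+ *         case 1: oldpix |= newpix;      break;   // OR
+ *         case 2: oldpix &= newpix;      break;   // AND
+ *         case 3: oldpix ^= newpix;      break;   // EOR
+ *         case 4: oldpix = ~oldpix;      break;   // invert old pixel
+ *         case 5:                        break;   // do nothing
+ *         case 6: oldpix &= ~newpix;     break;   // AND old with NOT of new
+ *         case 7: oldpix |= ~newpix;     break;   // OR old with NOT of new
+ *     }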
*/ +/* Recall GCOL actions: + * 0 -> overwrite old pixel + * 1 -> OR with old pixel + * 2 -> AND with old pixel + * 3 -> EOR with old pixel + * 4 -> invert old pixel + * 5 -> do nothing + * 6 -> AND old pixel with NOT of new pixel + * 7 -> OR old pixel with NOT of new pixel + */ +{ + comment(ws, "Put the pixel in the output stream."); + if (PLOTMASK) + { + if (DEST_32_BIT) + { + ins(ws, LDR(R(r_pixel), R(r_outptr)) | OFFSET(0), "LDR r_pixel,[r_outptr] ;bkah"); + ORR(R(r_pixel), R(r_inword), OP2R(R(r_pixel)), "ORR r_pixel,r_inword,r_pixel ; 1OR gcol action"); + EOR(R(r_pixel), R(r_bgcolour), OP2R(R(r_pixel)), "EOR r_pixel,r_bgcolour,r_pixel ; 1EOR gcol action"); + ins(ws, STR(R(r_pixel), R(r_outptr)) | OFFSET(0), "STR r_pixel,[r_outptr] ;blaq5h"); + } + else + { + if (DEST_16_BIT) + { + MOV(R(r_pixel), OP2R(R(r_inword)) | LSLI(16), "MOV r_pixel,r_inword,LSL #16 ; fetch 16 bit ECF pattern pixel44 99"); + MOV(R(r_pixel), OP2R(R(r_pixel)) | LSRI(16), "MOV r_pixel,r_pixel,LSR #16 ; 4444444"); + ORR(R(r_outword), R(r_outword), OP2R(R(r_pixel)), "ORR r_outword,r_outword,r_pixel ; ECF OR mask44 99"); + MOV(R(r_pixel), OP2R(R(r_bgcolour)) | LSLI(16), "MOV r_pixel,r_bgcolour,LSL #16 ; fetch 16 bit ECF pattern pixel 4499"); + MOV(R(r_pixel), OP2R(R(r_pixel)) | LSRI(16), "MOV r_pixel,r_pixel,LSR #16 ;449"); + EOR(R(r_outword), R(r_outword), OP2R(R(r_pixel)), "EOR r_outword,r_outword,r_pixel ; ECF EOR mask 4499"); + } + else + { + AND(R(r_pixel), R(r_inword), IMM(ws->out_pixmask), "1AND r_pixel,r_inword,#out_pixmask ; blah blah"); + ORR(R(r_outword), R(r_outword), OP2R(R(r_pixel)), "ORR r_outword,r_outword,r_pixel ; ECF OR mask"); + AND(R(r_pixel), R(r_bgcolour), IMM(ws->out_pixmask), "1AND r_pixel,r_bgcolour,#out_pixmask jthjg"); + EOR(R(r_outword), R(r_outword), OP2R(R(r_pixel)), "EOR r_outword,r_outword,r_pixel ; ECF EOR mask"); + } + } + } + else + { + if (DEST_32_BIT) + { + if (ws->gcol != 0) /* Not just a simple store operation */ + { + ins(ws, LDR(R(r_temp1), R(r_outptr)) | OFFSET(0), "LDR r_temp1,[r_outptr]"); + switch(ws->gcol) + { + case 7: MVN(R(r_pixel), OP2R(R(r_pixel)), "MVN r_pixel,r_pixel ; OR with neg action"); + case 1: ORR(R(r_temp1), R(r_pixel), OP2R(R(r_temp1)), "ORR r_temp1,r_pixel,r_temp1 ; OR gcol action"); break; + case 6: MVN(R(r_pixel), OP2R(R(r_pixel)), "MVN r_pixel,r_pixel ; AND with neg action"); + case 2: AND(R(r_temp1), R(r_pixel), OP2R(R(r_temp1)), "AND r_temp1,r_pixel,r_temp1 ; AND gcol action"); break; + case 3: EOR(R(r_temp1), R(r_pixel), OP2R(R(r_temp1)), "EOR r_temp1,r_pixel,r_temp1 ; EOR gcol action"); break; + case 4: MVN(R(r_temp1), OP2R(R(r_temp1)), "MVN r_temp1,r_temp1 ; neg gcol action"); break; + /* case 5: is a NOP */ + } + ins(ws, STR(R(r_temp1), R(r_outptr)) | OFFSET(0), "STR r_temp1,[r_outptr]"); + if ((ws->gcol == 7) || (ws->gcol == 6)) /* put r_pixel back as we found it */ + MVN(R(r_pixel), OP2R(R(r_pixel)), "1MVN r_pixel,r_pixel ; Put r_pixel back"); + } + else + { + ins(ws, STR(R(r_pixel), R(r_outptr)) | OFFSET(0), "STR r_pixel,[r_outptr]"); + } + } + else + { + if (ws->gcol == 6 || ws->gcol == 7) /* and/or with NOT of incoming pixel */ + { + if (DESTD_16_BIT) + { + EOR(R(r_pixel), R(r_pixel), IMM(255), "1EOR r_pixel,r_pixel,#0x00ff ; act with NOT of input pixel"); + EOR(R(r_pixel), R(r_pixel), IMM(255) | IMMROR(24), "1EOR r_pixel,r_pixel,#0xff00"); + } + else + EOR(R(r_pixel), R(r_pixel), IMM(ws->out_dpixmask), "1EOR r_pixel,r_pixel,#out_dpixmask ; act with NOT of input pixel"); + } + + switch (ws->gcol) + { + case 0: + if (SOURCE_MASK) /* if no 
mask, the pixels are clear already */ + { + if (DESTD_16_BIT) + { + BIC(R(r_outword), R(r_outword), IMM(255), "BIC r_outword,r_outword,#0x00ff"); + BIC(R(r_outword), R(r_outword), IMM(255) | IMMROR(24), "BIC r_outword,r_outword,#0xff00"); + } + else + BIC(R(r_outword), R(r_outword), IMM(ws->out_dpixmask), "BIC r_outword,r_outword,#out_dpixmask"); + } + /* fall through */ + case 7: + case 1: ORR(R(r_outword), R(r_outword), OP2R(R(r_pixel)), "ORR r_outword,r_outword,r_pixel ; gcol action"); break; + case 6: + case 2: AND(R(r_outword), R(r_outword), OP2R(R(r_pixel)), "AND r_outword,r_outword,r_pixel ; AND gcol action"); break; + case 3: EOR(R(r_outword), R(r_outword), OP2R(R(r_pixel)), "EOR r_outword,r_outword,r_pixel ; EOR gcol action"); break; + case 4: if (DESTD_16_BIT) + { + EOR(R(r_outword), R(r_outword), IMM(255), "EOR r_outword,r_outword,#0x00ff ; negate existing pixel"); + EOR(R(r_outword), R(r_outword), IMM(255) | IMMROR(24),"EOR r_outword,r_outword,#0xff00"); + } + else + EOR(R(r_outword), R(r_outword), IMM(ws->out_dpixmask),"EOR r_outword,r_outword,#out_dpixmask ; negate existing pixel"); + break; + case 5: comment(ws, "no GCOL action"); break; + } + if (ws->gcol == 6 || ws->gcol == 7) /* put r_pixel back as we found it in case scaling > 1:1! */ + { + if (DESTD_16_BIT) + { + EOR(R(r_pixel), R(r_pixel), IMM(255), "EOR r_pixel,r_pixel,#0x00ff ; put r_pixel back as it was"); + EOR(R(r_pixel), R(r_pixel), IMM(255) | IMMROR(24), "EOR r_pixel,r_pixel,#0xff00 ; put r_pixel back as it was"); + } + else + EOR(R(r_pixel), R(r_pixel), IMM(ws->out_dpixmask), "EOR r_pixel,r_pixel,#out_dpixmask ; put r_pixel back as it was"); + } + } + } +} + +static void save_pixel_opt(asm_workspace *wp, workspace *ws) +/* Save pixel for use by optimised >5 scaling code. */ +{ + if (DESTD_16_BIT) + { + BIC(R(r_outword), R(r_outword), IMM(255), "3BIC r_outword,r_outword,#0x00ff"); + BIC(R(r_outword), R(r_outword), IMM(255) | IMMROR(24), "4BIC r_outword,r_outword,#0xff00"); + } + else + { + BIC(R(r_outword), R(r_outword), IMM(ws->out_dpixmask), "5BIC r_outword,r_outword,#out_dpixmask"); + } + ORR(R(r_outword), R(r_outword), OP2R(R(r_pixel)), "6ORR r_outword,r_outword,r_pixel ; gcol action"); +} + +static void save_pixel2(asm_workspace *wp, workspace *ws) +/* Save the new pixel into the pixel after the current destination pixel. */ +{ + comment(ws, "Put the pixel in the output stream, one after the 'current' pixel."); + + /* Current limitation */ + assert(ws->gcol == 0, ERROR_FATAL); + + if (DEST_32_BIT) + { + ins(ws, STR(R(r_pixel), R(r_outptr)) | OFFSET(4), "STR r_pixel,[r_outptr,#4]"); + } + else + { + if (SOURCE_MASK) + { + if (wp->BPC == 16) /* DEST_16_BIT but includes double-pixel 256-colour mode 10 too */ + { + BIC(R(r_outword), R(r_outword), IMM(255) | IMMROR(16), "BIC r_outword,r_outword,#0x00ff0000"); + BIC(R(r_outword), R(r_outword), IMM(255) | IMMROR(8), "BIC r_outword,r_outword,#0xff000000"); + } + else + BIC(R(r_outword), R(r_outword), + wp->BPC == 1 + ? IMM(2) /* IMMROR arg must be an even number */ + : IMM(ws->out_dpixmask) | IMMROR(32 - wp->BPC), "BIC r_outword,r_outword,#out_dpixmask:SHL:out_bpc"); + } + ORR(R(r_outword),R(r_outword), + OP2R(R(r_pixel)) | LSLI(wp->BPC), "ORR r_outword,r_outword,r_pixel,LSL #out_bpc"); + } +} + +/************************************************************************** +* * +* Bitblit: Advancing the current pixel. 
* +* * +**************************************************************************/ + +static void fetch_pixel_inc(asm_workspace *wp, workspace *ws) +/* Increment the pointer to the source pixel */ +{ + comment(ws, "Advance source pointer"); + + if (!PLOTMASK) /* The ECF pattern remains aligned to the destination */ + { + if (SOURCE_32_BIT) + { + ADD(R(r_inptr), R(r_inptr), IMM(4), "ADD r_inptr,r_inptr,#4"); + } + else + { + MOV(R(r_inword), OP2R(R(r_inword)) | RORI(ws->in_bpc), "MOV r_inword,r_inword,ROR #in_bpc"); + if (SOURCE_MASK & !(SOURCE_BPPMASK)) + MOV(R(r_maskinword), OP2R(R(r_maskinword)) | + RORI(ws->in_bpc), "MOV r_maskinword,r_maskinword,ROR #in_bpc"); + SUB(R(r_inshift), R(r_inshift), + S | IMM(ws->in_bpc*2) | IMMROR(6), "SUBS r_inshift,r_inshift,#in_bpc:SHL:27 ; auto-resets itself to 0"); + ins(ws, LDR(R(r_inword), R(r_inptr)) + | EQ | WRITEBACK | OFFSET(4), "LDREQ r_inword,[r_inptr,#4]!"); + } + } + + if (SOURCE_MASK) + { + if (SOURCE_BPPMASK || PLOTMASK) + { + MOV(R(r_maskinword), OP2R(R(r_maskinword)) + | RORI(ws->mask_bpp), "MOV r_maskinword,r_maskinword,ROR #mask_bpp"); + SUB(R(r_maskinshift),R(r_maskinshift), + S | IMM(ws->mask_bpp*2) | IMMROR(6),"SUBS r_maskinshift,r_maskinshift,#mask_bpp:SHL:27 ; auto-resets itself to 0"); + ins(ws, LDR(R(r_maskinword), R(r_maskinptr)) + | EQ | WRITEBACK | OFFSET(4), "LDREQ r_maskinword,[r_maskinptr,#4]!"); + } + else + { + assert(!SOURCE_32_BIT, ERROR_FATAL); + ins(ws, LDR(R(r_maskinword), + R(r_inptr)) | EQ | INDEX(R(r_masko), 0), "LDREQ r_maskinword,[r_inptr,r_masko]"); + } + } +} + +static void fetch_pixel_inc2(asm_workspace *wp, workspace *ws) +/* Increment the pointer to the source pixel by two - only used in the 2-at-a-time + * optimised loop + */ +{ + comment(ws, "Advance source pointer by two pixels"); + if (SOURCE_32_BIT) + { + ADD(R(r_inptr), R(r_inptr), IMM(8), "ADD r_inptr,r_inptr,#8 ; past 2 32-bit pixels"); + } + else if (SOURCED_16_BIT) + { + /* Two pixels per word - assured of loading a new word */ + ins(ws, LDR(R(r_inword), R(r_inptr)) + | WRITEBACK | OFFSET(4), "LDR r_inword,[r_inptr,#4]! ; past 2 16-bit pixels"); + } + else + { + MOV(R(r_inword), OP2R(R(r_inword)) | RORI(ws->in_bpc*2), "MOV r_inword,r_inword,ROR #in_bpc*2"); + if (SOURCE_MASK & !(SOURCE_BPPMASK)) + MOV(R(r_maskinword), OP2R(R(r_maskinword)) | + RORI(ws->in_bpc*2), "MOV r_maskinword,r_maskinword,ROR #in_bpc*2"); + SUB(R(r_inshift), R(r_inshift), + S | IMM(ws->in_bpc) | IMMROR(4), "SUBS r_inshift,r_inshift,#in_bpc:SHL:28 ; auto-resets itself to 0"); + ins(ws, LDR(R(r_inword), R(r_inptr)) + | EQ | WRITEBACK | OFFSET(4), "LDREQ r_inword,[r_inptr,#4]! ; load more input pixels (inc2)"); + } + + if (SOURCE_MASK) + { + if (SOURCE_BPPMASK) + { +#if 0 + MOV(R(r_maskinword), OP2R(R(r_maskinword)) + | RORI(ws->mask_bpp), "x"/*MOV r_maskinword,r_maskinword,ROR #mask_bpp"*/); + SUB(R(r_maskinshift),R(r_maskinshift), + S | IMM(ws->mask_bpp*2) | IMMROR(6), "x"/*SUBS r_maskinshift,r_maskinshift,#mask_bpp:SHL:27"*/); + ins(ws, LDR(R(r_maskinword), R(r_maskinptr)) + | EQ | WRITEBACK | OFFSET(4), "x"/* "LDREQ r_maskinword,[r_maskinptr,#4]! ; load more mask pixels (inc2)"*/); +#endif + MOV(R(r_maskinword), OP2R(R(r_maskinword)) + | RORI(ws->mask_bpp), "MOV r_maskinword,r_maskinword,ROR #mask_bpp"); + SUB(R(r_maskinshift),R(r_maskinshift), + S | IMM(ws->mask_bpp*2) | IMMROR(6),"SUBS r_maskinshift,r_maskinshift,#mask_bpp:SHL:27"); + ins(ws, LDR(R(r_maskinword), R(r_maskinptr)) + | EQ | WRITEBACK | OFFSET(4), "LDREQ r_maskinword,[r_maskinptr,#4]! 
; load more mask pixels (inc2)"); + } + else + { + assert(!SOURCE_32_BIT, ERROR_FATAL); + ins(ws, LDR(R(r_maskinword), R(r_inptr)) + | EQ | INDEX(R(r_masko), 0), "LDREQ r_maskinword,[r_inptr,r_masko] ; load more mask pixels (inc2)"); + } + } +} + +static void odither_inc(asm_workspace *wp, workspace *ws, int xy) +/* Call every output pixel - alternates the ordered dither addition value + * xy == 0 for x, 1 for y + */ +{ + if (ws->odither) + EOR(R(r_oditheradd),R(r_oditheradd), IMM(1 << (ws->odither - xy)) | IMMROR(8), + xy == 0 ? "EOR r_oditheradd,r_oditheradd,#odither_eorvalue ; alternate dither offset" + : "EOR r_oditheradd,r_oditheradd,#odither_eorvalue:SHR:1 ; alternate dither offset"); + UNUSED(wp); +} + +#if 1 +static void skip_current_output_words(asm_workspace *wp, workspace *ws) +/* Skip over masked out words. r_xcount = output pixels to skip + * r_temp1 = pixels left in current word. + */ +{ + comment(ws, "4Skipping masked words."); + if (DEST_32_BIT) + { + ADD(R(r_outptr), R(r_outptr), R(r_xcount) | LSLI(2), "4~ADD r_outptr,r_outptr,r_xcount,LSL #2 ; skip 4*pixels bytes"); + MOV(R(r_xcount), IMM(0), "41MOV r_xcount,#0"); + } + else + { + SUB(R(r_xcount), R(r_xcount), OP2R(R(r_temp1)), "4~SUB r_xcount, r_xcount, r_temp1"); + MOV(R(r_temp1), OP2R(R(r_temp1)) | LSLI(ws->out_l2bpc), "4~MOV r_temp1, t_temp1, LSL #out_log2bpc"); + MOV(R(r_outword), OP2R(R(r_outword)) | RORR(R(r_temp1)), "4~MOV r_outword,r_outword,ROR r_temp1"); + ins(ws, STR(R(r_outword), R(r_outptr)) | POSTINC(4), "4~STR r_outword,[r_outptr],#4"); + MOV(R(r_outshift), IMM(0), "4~MOV r_outshift, #0"); + + MOV(R(r_temp1), OP2R(R(r_xcount)) | S |LSRI(ws->out_l2ppw), "4~~MOVS r_temp1,r_xcount,LSR #out_log2ppw ; whole words to skip"); + ADD(R(r_outptr), R(r_outptr), NE | R(r_temp1) | LSLI(2), "4~ADDNE r_outptr,r_outptr,r_temp1,LSL #2 ; skip 4*pixels bytes"); + + ins(ws, LDR(R(r_outword), R(r_outptr)) | OFFSET(0), "4~~LDR r_outword,[r_outptr]"); + SUB(R(r_xcount), R(r_xcount), + OP2R(R(r_temp1)) | LSLI(ws->out_l2ppw), "4~SUB r_xcount, r_xcount, r_temp1 LSL #out_log2ppw ; pixels left to skip"); + } +} + +static void skip_some_pixels(asm_workspace *wp, workspace *ws) +/* Adjust outword and outshift back to start */ +{ + MOV(R(r_temp1), OP2R(R(r_xcount)) | LSLI(ws->out_l2bpc), "2~~MOV r_temp1, r_xcount, LSL #out_log2bpc"); + MOV(R(r_outword), OP2R(R(r_outword)) | RORR(R(r_temp1)), "2~MOV r_outword,r_outword,ROR r_temp1"); + SUB(R(r_outshift), R(r_outshift), + OP2R(R(r_temp1)) | LSLI(27), "2~~SUB r_outshift,r_outshift,r_temp1,SHL #27"); + MOV(R(r_xcount), IMM(0), "31MOV r_xcount,#0"); + UNUSED(wp); +} +#endif + +static void save_pixel_inc(asm_workspace *wp, workspace *ws) +/* Increment the pointer to the destination pixel */ +{ + comment(ws, "Advance destination pointer"); + if (DEST_32_BIT) + { + ADD(R(r_outptr), R(r_outptr), IMM(4), "ADD r_outptr,r_outptr,#4 323232"); + if (PLOTMASK) + { +#if 0 + EOR(R(r_ecfindex), R(r_ecfindex), IMM(4), "EOR r_ecfindex,r_ecfindex,#4 ; either 0 or 4323232"); +#endif +/* ins(ws, LDR(R(r_inword), R(r_inptr)) + | INDEX(R(r_ecfindex), 0), "LDR r_inword,[r_inptr,r_ecfindex] ; load next word of ECF 32323"); + ADD(R(r_ecfindex), R(r_ecfindex), + IMM(4), "ADD r_ecfindex,r_ecfindex,#4 132323"); + ins(ws, LDR(R(r_bgcolour), R(r_inptr)) + | INDEX(R(r_ecfindex), 0), "LDR r_bgcolour,[r_inptr,r_ecfindex] ; load next EOR word of ECF123232"); + SUB(R(r_ecfindex), R(r_ecfindex), + IMM(4), "SUB r_ecfindex,r_ecfindex,#4 132323"); +*/ } + } + else + { + MOV(R(r_outword), OP2R(R(r_outword)) | RORI(wp->BPC), "MOV 
r_outword,r_outword,ROR #out_bpc 545454"); + if (PLOTMASK) + { + MOV(R(r_inword), OP2R(R(r_inword)) | RORI(wp->BPC), "MOV r_inword,r_inword,ROR #out_bpc ; advance ECF pattern 5"); + MOV(R(r_bgcolour), OP2R(R(r_bgcolour)) | RORI(wp->BPC), "MOV r_bgcolour,r_bgcolour,ROR #out_bpc ; advance ECF eeyore pattern 5"); + } + SUB(R(r_outshift), R(r_outshift), + S | IMM(wp->BPC*2) | IMMROR(6), "SUBS r_outshift,r_outshift,#out_bpc:SHL:27 5"); + ins(ws, STR(R(r_outword), R(r_outptr)) | EQ | POSTINC(4),"STREQ r_outword,[r_outptr],#4 4"); + if (ws->gcol == 0 && !SOURCE_MASK && !PLOTMASK) + MOV(R(r_outword), EQ | IMM(0), "MOVEQ r_outword,#0 ; setting pixels and no mask 4"); + else + ins(ws, LDR(R(r_outword), R(r_outptr)) | EQ | OFFSET(0), "LDREQ r_outword,[r_outptr] 4"); + /* If entirely replacing pixels, no need to fetch the old ones. + * The last word has to be patched up carefully, see x_loop. + */ + if (PLOTMASK) + { +#if 0 + EOR(R(r_ecfindex), R(r_ecfindex), EQ | IMM(4), "EOREQ r_ecfindex,r_ecfindex,#4 ; either 0 or 4"); +#endif +#if 0 + ins(ws, LDR(R(r_inword), R(r_inptr)) + | INDEX(R(r_ecfindex) | EQ, 0), "LDREQ r_inword,[r_inptr,r_ecfindex] ; load next word of ECF 1"); + ADD(R(r_ecfindex), R(r_ecfindex), + IMM(4) | EQ, "ADDEQ r_ecfindex,r_ecfindex,#4 2"); + ins(ws, LDR(R(r_bgcolour), R(r_inptr)) + | INDEX(R(r_ecfindex) | EQ, 0), "LDREQ r_bgcolour,[r_inptr,r_ecfindex] ; load next EOR word of ECF2"); + SUB(R(r_ecfindex), R(r_ecfindex), + IMM(4) | EQ, "SUBEQ r_ecfindex,r_ecfindex,#4 2"); +#endif + } + } + odither_inc(wp, ws, 0); +} + +static void save_pixel_inc2(asm_workspace *wp, workspace *ws) +/* Increment the pointer to the destination pixel by two. You are assured that + * a word fetch won't be necessary after the first of these. Only used in the + * optimised 2-at-a-time inner loop. You are assured that gcol==0. + */ +{ + comment(ws, "Advance destination pointer by two pixels"); + if (DEST_32_BIT) + ADD(R(r_outptr), R(r_outptr), IMM(8), "ADD r_outptr,r_outptr,#8"); + else if (DESTD_16_BIT) + { + /* Two pixels per word - assured of saving a word, assured that gcol==0 and !SOURCE_MASK*/ + ins(ws, STR(R(r_outword), R(r_outptr)) | POSTINC(4), "STR r_outword,[r_outptr],#4 ; store two pixels"); + if (!SOURCE_MASK) + MOV(R(r_outword), IMM(0), "MOV r_outword,#0 ; setting pixels and no mask"); + else + ins(ws, LDR(R(r_outword), R(r_outptr)) | OFFSET(0), "LDR r_outword,[r_outptr] ; load dest data (in case of mask)"); + } + else + { + MOV(R(r_outword), OP2R(R(r_outword)) | RORI(wp->BPC*2), "MOV r_outword,r_outword,ROR #out_bpc*2"); + SUB(R(r_outshift), R(r_outshift), + S | IMM(wp->BPC) | IMMROR(4), "SUBS r_outshift,r_outshift,#out_bpc:SHL:28"); + ins(ws, STR(R(r_outword), R(r_outptr)) | EQ | POSTINC(4),"STREQ r_outword,[r_outptr],#4 ; store pixels (inc2)"); + if (!SOURCE_MASK) + MOV(R(r_outword), EQ | IMM(0), "MOVEQ r_outword,#0 ; setting pixels and no mask (inc2)"); + else + ins(ws, LDR(R(r_outword), R(r_outptr)) | EQ | OFFSET(0), "LDREQ r_outword,[r_outptr] ; get dest data (in case of mask)"); + /* If entirely replacing pixels, no need to fetch the old ones. + * The last word has to be patched up carefully, see x_loop. + */ + } + odither_inc(wp, ws, 0); /* assume this has also been called once after the first pixel has been translated */ +} + +static void plot_current_output_words(asm_workspace *wp, workspace *ws, int scale) +/* plot multiple words of one pixel. r_xcount = output pixels to skip + * r_temp1 = pixels left in current word. + * r_pixel = pixel to output. 
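+ *
+ * Roughly, and purely as an illustrative C model (ignoring the double-pixel modes and
+ * the partially filled word dealt with via r_temp1/r_outword first):
+ *
+ *     unsigned w = pixel;
+ *     for (int b = out_bpp; b < 32; b <<= 1)
+ *         w |= w << b;                      // replicate the pixel across a whole word
+ *     while (xcount >= out_ppw)
+ *     {
+ *         *outptr++ = w;                    // store whole words of the repeated pixel
+ *         xcount   -= out_ppw;
+ *     }
+ *     // any pixels still left in xcount are merged into the next partial word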
+ */ +{ + int loop; + comment(ws, "2Optimised plotting of scaled sprite."); + if (DEST_32_BIT) + { +#if 1 + ins(ws, STR(R(r_pixel), R(r_outptr)) | POSTINC(4), "32STR r_pixel,[r_outptr],#4"); + SUB(R(r_xcount), R(r_xcount), + S | IMM(1), "14SUBS r_xcount,r_xcount,#1"); + if (scale < 21) + { + for (loop = 1;loop<scale;loop++) + { + ins(ws, STR(R(r_pixel), R(r_outptr)) | NE | POSTINC(4), "32STRNE r_pixel,[r_outptr],#4"); + SUB(R(r_xcount), R(r_xcount), + S | NE | IMM(1), "14SUBNES r_xcount,r_xcount,#1"); + } + } + else + { + CMP(R(r_xcount), IMM(10), "CMP r_xcount, #10"); + branch(ws, B | LE, L(plot_loop1b), "BLE plot_loop1b"); + DEFINE_LABEL(plot_loop1a, "loop for every ten pixels") + for (loop = 0;loop<10;loop++) + { + ins(ws, STR(R(r_pixel), R(r_outptr)) | POSTINC(4), "32STR r_pixel,[r_outptr],#4"); + } + SUB(R(r_xcount), R(r_xcount), + IMM(10), "14SUB r_xcount,r_xcount,#10"); + CMP(R(r_xcount), IMM(10), "CMP r_xcount, #10"); + branch(ws, B | GT, L(plot_loop1a), "BGT plot_loop1a"); + DEFINE_LABEL(plot_loop1b, "branch here when LH side obscured") + CMP(R(r_xcount), IMM(0), "CMP r_xcount, #0"); + for (loop = 0;loop<10;loop++) + { + ins(ws, STR(R(r_pixel), R(r_outptr)) | NE | POSTINC(4), "4STRNE r_pixel,[r_outptr],#4"); + SUB(R(r_xcount), R(r_xcount), + S | NE | IMM(1), "16SUBNES r_xcount,r_xcount,#1"); + } + } +#else + for (loop = 0;loop<scale;loop++) + ins(ws, STR(R(r_pixel), R(r_outptr)) | POSTINC(4), "32STR r_outword,[r_outptr],#4"); +#endif + } + else + { + SUB(R(r_xcount), R(r_xcount), OP2R(R(r_temp1)), "52SUB r_xcount, r_xcount, r_temp1"); + + DEFINE_LABEL(plot_loop1, "1???") + save_pixel_opt(wp, ws); + MOV(R(r_outword), OP2R(R(r_outword)) | RORI(wp->BPC), "015MOV r_outword,r_outword,ROR #out_bpc"); + SUB(R(r_outshift), R(r_outshift), + S | IMM(wp->BPC*2) | IMMROR(6), "7SUBS r_outshift,r_outshift,#out_bpc:SHL:27"); + branch(ws, B | EQ, L(plot_loop1a), "BEQ plot_loop1a"); + + save_pixel_opt(wp, ws); + MOV(R(r_outword), OP2R(R(r_outword)) | RORI(wp->BPC), "115MOV r_outword,r_outword,ROR #out_bpc"); + SUB(R(r_outshift), R(r_outshift), + S | IMM(wp->BPC*2) | IMMROR(6), "17SUBS r_outshift,r_outshift,#out_bpc:SHL:27"); + branch(ws, B | EQ, L(plot_loop1b), "BEQ plot_loop1b"); + + save_pixel_opt(wp, ws); + MOV(R(r_outword), OP2R(R(r_outword)) | RORI(wp->BPC), "215MOV r_outword,r_outword,ROR #out_bpc"); + SUB(R(r_outshift), R(r_outshift), + S | IMM(wp->BPC*2) | IMMROR(6), "27SUBS r_outshift,r_outshift,#out_bpc:SHL:27"); + branch(ws, B | EQ, L(plot_loop1c), "BEQ plot_loop1c"); + + save_pixel_opt(wp, ws); + MOV(R(r_outword), OP2R(R(r_outword)) | RORI(wp->BPC), "315MOV r_outword,r_outword,ROR #out_bpc"); + SUB(R(r_outshift), R(r_outshift), + S | IMM(wp->BPC*2) | IMMROR(6), "37SUBS r_outshift,r_outshift,#out_bpc:SHL:27"); + + branch(ws, B | NE, L(plot_loop1), "8BNE plot_loop1"); + DEFINE_LABEL(plot_loop1a, "plot loop 1a - coz only one forward referance allowed") + DEFINE_LABEL(plot_loop1b, "plot loop 1b - coz only one forward referance allowed") + DEFINE_LABEL(plot_loop1c, "plot loop 1c - coz only one forward referance allowed") + + ins(ws, STR(R(r_outword), R(r_outptr)) | POSTINC(4), "9STR r_outword,[r_outptr],#4"); + + MOV(R(r_temp1), OP2R(R(r_xcount)) | S |LSRI(ws->out_l2ppw), "0MOVS r_temp1,r_xcount,LSR #out_log2ppw ; whole words to skip"); + + branch(ws, B | EQ, L(plot_loop3), "1BEQ plot_loop3"); + + for (loop = wp->BPP;loop<32;loop*=2) + ORR(R(r_pixel), R(r_pixel), OP2R(R(r_pixel)) | LSLI(loop), "2ORR r_pixel,r_pixel,r_pixel, LSL #somenumber"); + DEFINE_LABEL(plot_loop2, "2???") + ins(ws, 
STR(R(r_pixel), R(r_outptr)) | POSTINC(4), "3STR r_pixel,[r_outptr],#4"); + SUB(R(r_xcount), R(r_xcount), + IMM(ws->out_ppw), "4SUB r_xcount,r_xcount,#out_ppw"); + SUB(R(r_temp1), R(r_temp1), + S | IMM(1), "5SUBS r_temp1,r_temp1,#1"); + branch(ws, B | NE, L(plot_loop2), "6BNE plot_loop2"); + + if (DESTD_16_BIT) + { + MOV(R(r_pixel), OP2R(R(r_pixel)) | LSLI(16), "7MOV r_pixel, r_pixel, LSL #16 ; whole words to skip"); + MOV(R(r_pixel), OP2R(R(r_pixel)) | LSRI(16), "8MOV r_pixel, r_pixel, LSR #16 ; whole words to skip"); + } + else + AND(R(r_pixel), R(r_pixel), IMM(ws->out_dpixmask), "9AND r_pixel,r_pixel,#dpix_mask"); + DEFINE_LABEL(plot_loop3, "3???") + + + ins(ws, LDR(R(r_outword), R(r_outptr)) | OFFSET(0), "0LDR r_outword,[r_outptr]"); + } +} + +static void plot_some_pixels(asm_workspace *wp, workspace *ws) +/* Non complete word pixel plot */ +{ + DEFINE_LABEL(plot_loop4, "4???") + save_pixel_opt(wp, ws); + MOV(R(r_outword), OP2R(R(r_outword)) | RORI(wp->BPC), "14MOV r_outword,r_outword,ROR #out_bpc"); + SUB(R(r_outshift), R(r_outshift), + S | IMM(wp->BPC*2) | IMMROR(6), "15SUBS r_outshift,r_outshift,#out_bpc:SHL:27"); + SUB(R(r_xcount), R(r_xcount), + S | IMM(1), "16SUBS r_xcount, r_xcount, #1"); + branch(ws, B | EQ, L(plot_loop4a), "17BEQ plot_loop4a"); + + save_pixel_opt(wp, ws); + MOV(R(r_outword), OP2R(R(r_outword)) | RORI(wp->BPC), "214MOV r_outword,r_outword,ROR #out_bpc"); + SUB(R(r_outshift), R(r_outshift), + S | IMM(wp->BPC*2) | IMMROR(6), "215SUBS r_outshift,r_outshift,#out_bpc:SHL:27"); + SUB(R(r_xcount), R(r_xcount), + S | IMM(1), "216SUBS r_xcount, r_xcount, #1"); + branch(ws, B | EQ, L(plot_loop4b), "17BEQ plot_loop4b"); + + save_pixel_opt(wp, ws); + MOV(R(r_outword), OP2R(R(r_outword)) | RORI(wp->BPC), "314MOV r_outword,r_outword,ROR #out_bpc"); + SUB(R(r_outshift), R(r_outshift), + S | IMM(wp->BPC*2) | IMMROR(6), "315SUBS r_outshift,r_outshift,#out_bpc:SHL:27"); + SUB(R(r_xcount), R(r_xcount), + S | IMM(1), "316SUBS r_xcount, r_xcount, #1"); + branch(ws, B | EQ, L(plot_loop4c), "17BEQ plot_loop4c"); + + save_pixel_opt(wp, ws); + MOV(R(r_outword), OP2R(R(r_outword)) | RORI(wp->BPC), "414MOV r_outword,r_outword,ROR #out_bpc"); + SUB(R(r_outshift), R(r_outshift), + S | IMM(wp->BPC*2) | IMMROR(6), "415SUBS r_outshift,r_outshift,#out_bpc:SHL:27"); + SUB(R(r_xcount), R(r_xcount), + S | IMM(1), "416SUBS r_xcount, r_xcount, #1"); + + branch(ws, B | NE, L(plot_loop4), "17BNE plot_loop4"); + DEFINE_LABEL(plot_loop4a, "plot loop 4a - coz only one forward referance allowed") + DEFINE_LABEL(plot_loop4b, "plot loop 4b - coz only one forward referance allowed") + DEFINE_LABEL(plot_loop4c, "plot loop 4c - coz only one forward referance allowed") +} + +/************************************************************************** +* * +* Bitblit: Overall construction of the X loop. * +* * +**************************************************************************/ + +#define ADD_A(reg,value) arbitrary_add(ws, TRUE, FALSE, &ws->regnames.reg, value); +#define ADDS_A(reg,value) arbitrary_add(ws, TRUE, TRUE, &ws->regnames.reg, value); +#define SUB_A(reg,value) arbitrary_add(ws, FALSE, FALSE, &ws->regnames.reg, value); +#define SUBS_A(reg,value) arbitrary_add(ws, FALSE, TRUE, &ws->regnames.reg, value); + +static void arbitrary_add(workspace *ws, BOOL add, BOOL s, regname *r, int value) +/* Add/subtract an arbitrary constant to a register - could be more than 8 bits. 
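+ * ARM data-processing immediates are an 8-bit value rotated right by an even number of
+ * bits, so the constant is emitted as a chain of 8-bit chunks; for instance adding &1234
+ * produces ADD reg,reg,#&234 followed by ADD reg,reg,#&1000, with any requested S flag
+ * applied only to the final instruction.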
*/ +{ + IFDEBUG(char a[256];) + + if (value < 0) {value = -value; add = !add;} + if (value == 0) /* special case with 0 constant */ + { + if (s) + { + IFDEBUG(do_sprintf(a, "CMP %s,#0", r->name);) + CMP(r->regno, IMM(0), a); + } + /* else, nothing */ + } + else + { + int opcode = add ? ADD_OPCODE : SUB_OPCODE; + int sopcode = s ? S : 0; + int shift_it = 0; + + while (value != 0) + { + BOOL last; + int valuebyte; + + if (value > 255) + while ((value & 3) == 0) {value >>= 2; shift_it += 2;} + valuebyte = value & 0xff; + value &= 0xffffff00; + last = value == 0; /* the last instruction needed */ + IFDEBUG( + do_sprintf(a, + (last && sopcode ? "%sS%t8.%s,%s,#&%x" : "%s%t8.%s,%s,#&%x") _ + (add ? "ADD" : "SUB") _ r->name _ r->name _ valuebyte << shift_it);) + ins(ws, opcode | (last ? sopcode : 0) + | DSTR(r->regno) | OP1R(r->regno) + | IMM(valuebyte) | IMMROR ((32 - shift_it) & 0x1e), + a); + } + } +} + +static void init_word_registers(asm_workspace *wp, workspace *ws) +/* Initialise inword, outword, maskinword from their respective pointers + * and shift values. + */ +{ + comment(ws, "Load initial values of word registers"); + + /* Set up inword */ + if (!PLOTMASK) /* PLOTMASK case handled below, because helped by setting up r_outword */ + { + if (!SOURCE_32_BIT) + { + ins(ws, LDR(R(r_inword), R(r_inptr)) | OFFSET(0), "LDR r_inword,[r_inptr] ; fetch first input pixels"); + MOV(R(r_pixel), OP2R(R(r_inshift)) | LSRI(27), "MOV r_pixel,r_inshift,LSR #27 ; get real shift distance"); + RSB(R(r_pixel), R(r_pixel), IMM(32), "RSB r_pixel,r_pixel,#32 ; temporary use of r_pixel"); + MOV(R(r_inword), OP2R(R(r_inword)) | RORR(R(r_pixel)), "MOV r_inword,r_inword,ROR r_pixel " + "; current input pixel now in least sig bit[s]"); + } + } + + if (SOURCE_MASK) /* Set up maskinword */ + { + if (SOURCE_BPPMASK || PLOTMASK) + { + ins(ws, LDR(R(r_maskinword), R(r_maskinptr)) | OFFSET(0), "LDR r_maskinword,[r_maskinptr] ; fetch first mask word"); + MOV(R(r_pixel), OP2R(R(r_maskinshift)) | LSRI(27), "MOV r_pixel,r_maskinshift,LSR #27 ; get real shift distance"); + RSB(R(r_pixel), R(r_pixel), IMM(32), "RSB r_pixel,r_pixel,#32 ; mask shift"); + } + else + ins(ws, LDR(R(r_maskinword), + R(r_inptr)) | INDEX(R(r_masko), 0), "LDR r_maskinword,[r_inptr,r_masko] ; fetch first mask word"); + MOV(R(r_maskinword), OP2R(R(r_maskinword)) | RORR(R(r_pixel)),"MOV r_maskinword,r_maskinword,ROR r_pixel " + "; current mask pixel now in least sig bit[s]"); + } + + if (!DEST_32_BIT) /* Set up outword */ + { + if (ws->gcol == 0 && !SOURCE_MASK && !PLOTMASK) + { + /* Faster in the inner loop, but the unneeded pixels must be cleared out first */ + MOV(R(r_pixel), S | OP2R(R(r_outshift)) | LSRI(27), "MOVS r_pixel,r_outshift,LSR #27 ; get real shift distance"); + ins(ws, NE | LDR(R(r_outword), R(r_outptr)) | OFFSET(0), "LDRNE r_outword,[r_outptr] ; load up output word"); + MOV(R(r_outword), NE | OP2R(R(r_outword)) + | LSLR(R(r_pixel)), "MOVNE r_outword,r_outword,LSL r_pixel " + "; set untouched pixels to correct places, clear the others"); + MOV(R(r_outword), EQ | IMM(0), "MOVEQ r_outword,#0 ; if r_pixel=0, make them all clear"); + } + else + { + ins(ws, LDR(R(r_outword), R(r_outptr)) | OFFSET(0), "LDR r_outword,[r_outptr] ; load up output word"); + MOV(R(r_pixel), OP2R(R(r_outshift)) | LSRI(27), "MOV r_pixel,r_outshift,LSR #27 ; get real shift distance"); + RSB(R(r_pixel), R(r_pixel), IMM(32), "RSB r_pixel,r_pixel,#32 ; temp use of r_pixel"); + MOV(R(r_outword), OP2R(R(r_outword)) | RORR(R(r_pixel)),"MOV r_outword,r_outword,ROR r_pixel " + 
"; current output pixel now in least sig bit[s]"); + /* Set up inword from ECF pattern - uses r_pixel value */ + if (PLOTMASK) + { + ins(ws, LDR(R(r_inword), R(r_inptr)) + | INDEX(R(r_ecfindex), 0), "LDR r_inword,[r_inptr,r_ecfindex] ; get ECF pattern word"); + MOV(R(r_inword), OP2R(R(r_inword)) | RORR(R(r_pixel)),"MOV r_inword,r_inword,ROR r_pixel 1 " + "; current ECF pixel now in least sig bit[s]"); + ADD(R(r_ecfindex), R(r_ecfindex), + IMM(4), "ADD r_ecfindex,r_ecfindex,#4 ; to load EOR word 1"); + ins(ws, LDR(R(r_bgcolour), R(r_inptr)) + | INDEX(R(r_ecfindex), 0), "LDR r_bgcolour,[r_inptr,r_ecfindex] ;fetch next EOR word of ECF1"); + SUB(R(r_ecfindex), R(r_ecfindex), + IMM(4), "SUB r_ecfindex,r_ecfindex,#4 ;blah1"); + MOV(R(r_bgcolour), OP2R(R(r_bgcolour)) | RORR(R(r_pixel)),"MOV r_bgcolour,r_bgcolour,ROR r_pixel 1 "); + } + } + } +} + +static void loop_x(asm_workspace *wp, workspace *ws) +/* The variables are set up - perform the inner loop that processes a + * single line. Fall out of the bottom of the loop when complete. + */ +{ + BOOL mask_possible; + + comment(ws, "The inner loop: iterating along a row of pixels."); + if (x_block_move(wp, ws)) + { + comment(ws, "Very simple inner loop - we use an existing block-move primitive"); + MOV(R(lr), OP2R(R(pc)), "MOV lr,pc ; remember return address"); + MOV(R(pc), OP2R(R(r_blockroutine)), "MOV pc,r_blockroutine ; block move"); + /* It would be a little bit more efficient to do state saving here rather than inside the routine, + * and so only save registers that need to be saved - not a big saving, and only per-line. + */ + } + else + { + init_word_registers(wp, ws); + + if (simple_x_scale(wp, ws)) /* 1:1 scaling */ + { + comment(ws, "1:1 scaling along x, so each source pixel is painted once"); + +#if 0 + align16(wp, ws); + DEFINE_LABEL(loop_x_repeat, "Loop around for each source/dest pixel") + mask_possible = fetch_pixel(wp, ws, &ws->labels.l_masked); + translate_pixel(wp, ws); + save_pixel(wp, ws); + if (mask_possible) DEFINE_LABEL(l_masked, "This pixel masked out") + fetch_pixel_inc(wp, ws); + save_pixel_inc(wp, ws); + SUB(R(r_xsize), R(r_xsize), S | IMM(1), "SUBS r_xsize,r_xsize,#1"); + branch(ws, B | NE, L(loop_x_repeat), "BNE loop_x_repeat"); +#else + + /* We generate a loop that does two pixels at a time, only advancing pointers, counts, shifts + * etc. every two pixels. There are two versions of this loop, one where the in and out shifts + * are 'in phase' (ie initially both even or both odd), one where they are out of phase. There + * is also some initial stuff to get the outshift to be even if necessary when entering either + * of these, and some final stuff to patch up the end. 
+ */ + comment(ws, "Optimised 2-at-a-time loop"); + if (!DEST_32_BIT) + { + TST(R(r_outshift), IMM(wp->BPC*2) | IMMROR(6), "TST r_outshift,#out_bpc:SHL:27 ; start at odd or even pixel shift?"); + branch(ws, B | EQ, L(x_evenstart), "BEQ x_evenstart ; B if even"); + comment(ws, "r_outshift an odd number of pixels - process just one of these"); + mask_possible = fetch_pixel(wp, ws, &ws->labels.x_oddmask); + translate_pixel(wp, ws); + save_pixel(wp, ws); + if (mask_possible) DEFINE_LABEL(x_oddmask, "This pixel masked out") + fetch_pixel_inc(wp, ws); + save_pixel_inc(wp, ws); + SUB(R(r_xsize), R(r_xsize), S | IMM(1), "SUBS r_xsize,r_xsize,#1 ; count towards overall width"); + branch(ws, B | EQ, L(loop_x_exit), "BEQ loop_x_exit ; check for just one pixel wide"); + DEFINE_LABEL(x_evenstart, "r_outshift is an even number of pixels") + } + if (!SOURCE_32_BIT) + { + TST(R(r_inshift), IMM(ws->in_bpc*2) | IMMROR(6), "TST r_inshift,#in_bpc:SHL:27 ; input at odd or even pixel shift?"); + branch(ws, B | NE, L(x_misaligned), "BNE x_misaligned ; B if odd"); + } + branch(ws, B, L(x_aligned_enter), "B x_aligned_enter ; else, in phase with output - start loop"); + newline(); + + align16(wp, ws); + DEFINE_LABEL(x_aligned_loop, "The 2-at-a-time inner loop, aligned case") + mask_possible = fetch_pixel(wp, ws, &ws->labels.x_alignmask1); + translate_pixel(wp, ws); + save_pixel(wp, ws); + if (mask_possible) DEFINE_LABEL(x_alignmask1, "First pixel masked out") + odither_inc(wp, ws, 0); + mask_possible = fetch_pixel2(wp, ws, &ws->labels.x_alignmask2); + translate_pixel(wp, ws); + save_pixel2(wp, ws); + if (mask_possible) DEFINE_LABEL(x_alignmask2, "Second pixel masked out") + fetch_pixel_inc2(wp, ws); + save_pixel_inc2(wp, ws); + DEFINE_LABEL(x_aligned_enter, "Entering the aligned 2-at-a-time inner loop") + SUB(R(r_xsize), R(r_xsize), S | IMM(2), "SUBS r_xsize,r_xsize,#2 ; done 2 pixels"); + branch(ws, B | GE, L(x_aligned_loop), "BGE x_aligned_loop ; loop until 0 or 1 left"); + if (!SOURCE_32_BIT) + { + branch(ws, B, L(x_2atatime_exit), "B x_2atatime_exit ; final patchup code"); + newline(); + + DEFINE_LABEL(x_misaligned, "The 2-at-a-time inner loop, misaligned case, entry sequence") + /* A bit delicate - we have to prepare the input stream for an inc2 call, + * by effectively winding it back by a pixel. We know this won't go back a word, + * however, because r_inshift is an odd number of pixels. 
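+       * (Concretely, for the usual case where a pixel and a double-pixel are the same size,
+       * r_inword is shifted up by one pixel and r_inshift increased by one pixel, so the first
+       * fetch_pixel2 of the misaligned loop picks up the pixel we are currently standing on.)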
+ */ + comment(ws, "Wind input stream back by a pixel"); + if (SOURCE_32_BIT) + SUB(R(r_inptr), R(r_inptr), IMM(4), "SUB r_inptr,r_inptr,#4 ; wind back a pixel"); + else + { + MOV(R(r_inword), OP2R(R(r_inword)) | LSLI(ws->in_bpp), "MOV r_inword,r_inword,LSL #in_bpp ; wind back a pixel"); + ADD(R(r_inshift), R(r_inshift), + IMM(ws->in_bpp*2) | IMMROR(6), "ADD r_inshift,r_inshift,#in_bpp:SHL:27"); + } + if (SOURCE_MASK) + { + MOV(R(r_maskinword), OP2R(R(r_maskinword)) + | LSLI(ws->mask_bpp), "MOV r_maskinword,r_maskinword,LSL #mask_bpp"); + if (SOURCE_BPPMASK) + ADD(R(r_maskinshift), R(r_maskinshift), + IMM(ws->mask_bpp*2) | IMMROR(6), "ADD r_maskinshift,r_maskinshift,#mask_bpp:SHL:27"); + } + branch(ws, B, L(x_misaligned_enter), "B x_misaligned_enter ; start misaligned loop"); + align16(wp, ws); + DEFINE_LABEL(x_misaligned_loop, "The 2-at-a-time inner loop, misaligned case") + mask_possible = fetch_pixel2(wp, ws, &ws->labels.x_misalignmask1); + translate_pixel(wp, ws); + save_pixel(wp, ws); + if (mask_possible) DEFINE_LABEL(x_misalignmask1, "A pixel masked out") + fetch_pixel_inc2(wp, ws); + odither_inc(wp, ws, 0); + mask_possible = fetch_pixel(wp, ws, &ws->labels.x_misalignmask2); + translate_pixel(wp, ws); + save_pixel2(wp, ws); + if (mask_possible) DEFINE_LABEL(x_misalignmask2, "Another pixel masked out") + save_pixel_inc2(wp, ws); + DEFINE_LABEL(x_misaligned_enter, "Entering the misaligned 2-at-a-time inner loop") + SUB(R(r_xsize), R(r_xsize), S | IMM(2), "SUBS r_xsize,r_xsize,#2 ; count towards overall size"); + branch(ws, B | GE, L(x_misaligned_loop), "BGE x_misaligned_loop ; and loop until done"); + fetch_pixel_inc(wp, ws); + newline(); + + DEFINE_LABEL(x_2atatime_exit, "Final patchup for 2-at-a-time inner loop") + } + else + newline(); + ADD(R(r_xsize), R(r_xsize), S | IMM(2), "ADDS r_xsize,r_xsize,#2 ; up to 0 or 1"); + branch(ws, B | EQ, L(loop_x_exit1), "BEQ loop_x_exit1 ; No last pixel to be done\n"); + mask_possible = fetch_pixel(wp, ws, &ws->labels.x_lastmask); + translate_pixel(wp, ws); + save_pixel(wp, ws); + if (mask_possible) DEFINE_LABEL(x_lastmask, "Last pixel masked out") + fetch_pixel_inc(wp, ws); + save_pixel_inc(wp, ws); + + DEFINE_LABEL( loop_x_exit1, "End of input pixel line (1)") +#endif + } + else + { + comment(ws, "Control of scaling along x"); + if (ws->odither && wp->save_xadd - wp->save_xdiv > wp->save_xdiv) + { + /* If dithering and scaling we have to be very careful about where we do fetch_pixel_inc, because when replicating + * a pixel we must repeatedly fetch_pixel it. + */ + SUB_A(r_xcount, wp->save_xadd) + DEFINE_LABEL( loop_x_repeat, "Loop around for each source pixel (ordered dither)") + ADD_A(r_xcount, wp->save_xadd) /*(GPS)*/ + + mask_possible = fetch_pixel(wp, ws, &ws->labels.l_masked); + SUBS_A(r_xcount, wp->save_xdiv) /* Stop dither from printing 1 too many pixels... (GPS) */ + DEFINE_LABEL( loop_put_pixel_repeat, "Repeatedly paint and ordered-dither a source pixel"); + translate_pixel(wp, ws); + save_pixel(wp, ws); + save_pixel_inc(wp, ws); + SUB(R(r_xsize), R(r_xsize), S | IMM(1), "SUBS r_xsize,r_xsize,#1 ; count output ordered dither pixels"); + branch(ws, B | EQ, L(loop_x_exit), "BEQ loop_x_exit ; painted enough pixels"); + /* We must not paint the same pixel repeatedly - we must reextract and retranslate it, otherwise + * the dithering on scaled up pixels will not occur. 
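+       * (The reason this matters is that save_pixel_inc calls odither_inc, which flips the
+       * dither value in r_oditheradd after every output pixel, so re-running translate_pixel
+       * on the same source pixel yields the alternating dithered values rather than a flat
+       * run of identical output pixels.)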
+ */ + fetch_pixel_unmasked(wp, ws); /* reextract the pixel into r_pixel */ + SUBS_A(r_xcount, wp->save_xdiv) /* Decrement count (GPS) */ + branch(ws, B | PL, L(loop_put_pixel_repeat), "BPL loop_put_pixel_repeat ; recalculate and repaint"); + fetch_pixel_inc(wp, ws); /* moved by (GPS) */ + branch(ws, B, L(loop_x_repeat), "B loop_x_repeat ; next input pixel"); + } + else + { + if ( !PLOTMASK && (wp->save_xmag % wp->save_xdiv) == 0 && ((wp->save_xmag / wp->save_xdiv) > 4) && ws->gcol == 0) + /* do optimised code */ + { + register int toskip = wp->save_xmag / wp->save_xdiv; + + tracef("in optimised scale\nxmag = %d, xdiv = %d, xmag mod xdiv = %d\n" _ wp->save_xmag _ wp->save_xdiv _ wp->save_xmag % wp->save_xdiv); + SUB_A(r_xcount, toskip) + DEFINE_LABEL( loop_x_repeat, "3Loop around for each source pixel") + TEQ(R(r_xsize), IMM(0), "3TEQ r_xsize, #0"); + DEFINE_LABEL(loop_x_exitskip, "3Kludge to avoid multiple forward references"); + branch(ws, B | EQ, L(loop_x_exit), "3BEQ loop_x_exit"); + ADD_A(r_xcount, toskip) + mask_possible = fetch_pixel(wp, ws, &ws->labels.l_masked); + translate_pixel(wp, ws); /* If we're about the discard the pixel this is in fact wasted work - we could reorganise + * this whole loop to improve that situation, but it doesn't really seem worthwhile, the gain + * is not enormous. + */ + fetch_pixel_inc(wp, ws); + + comment(ws, "3calculating number of times to plot pixel 1"); + MOV(R(r_temp1), OP2R(R(r_xsize)), "3MOV r_temp1, r_xsize ; store r_xsize"); + SUB(R(r_xsize), R(r_xsize), S | OP2R(R(r_xcount)), "3SUBS r_xsize, r_xsize, r_xcount ; count output pixels"); + MOV(R(r_xsize), MI | IMM(0), "3MOVMI r_xsize, #0 "); + MOV(R(r_xcount), MI | OP2R(R(r_temp1)), "3MOVMI r_xcount, r_temp1 "); + + if (!DEST_32_BIT) + { + MOV(R(r_temp1), S | OP2R(R(r_outshift)) | LSRI(27), "3MOVS r_temp1, r_outshift, LSR #27"); + MOV(R(r_temp1), EQ | IMM(32), "3MOVEQ r_temp1, #32 ; 0 in r_outshift => 32 bits left"); + if (!DEST_1_BIT) + MOV(R(r_temp1), OP2R(R(r_temp1)) | LSRI(ws->out_l2bpc), "3MOV r_temp1, r_temp1, LSR #out_log2bpc"); + CMP(R(r_xcount), OP2R(R(r_temp1)), "3CMP r_xcount, r_temp1"); + branch(ws, B + LT, L(loop2), "3BLT loop2 ; end of this masked input pixel"); + } + + plot_current_output_words(wp, ws, toskip); + + if (DEST_32_BIT) + { + branch(ws, B, L(loop_x_repeat), "11B loop_x_repeat ; end of this masked input pixel"); + } + else + { + TEQ(R(r_xcount), IMM(0), "1TEQ r_xcount, #0"); + branch(ws, B + EQ, L(loop_x_repeat), "1BEQ loop_x_repeat ; end of this masked input pixel"); + + DEFINE_LABEL(loop2, "Last word to plot") + plot_some_pixels(wp, ws); + branch(ws, B, L(loop_x_repeat), "1B loop_x_repeat ; end of this masked input pixel"); + } + +#if 0 + int loop; + + comment(ws, "Doing multiple plots of same pixel in line"); + DEFINE_LABEL( loop_x_repeat, "Loop around for each source pixel") + + CMN(R(pc), OP2R(R(pc)), "CMN pc, pc ; this will clear the Z flag"); + DEFINE_LABEL(loop_x_exitskip, "Kludge to avoid multiple forward references"); + branch(ws, B | EQ, L(loop_x_exit), "BEQ loop_x_exit"); + mask_possible = fetch_pixel(wp, ws, &ws->labels.l_masked); + translate_pixel(wp, ws); /* If we're about the discard the pixel this is in fact wasted work - we could reorganise + * this whole loop to improve that situation, but it doesn't really seem worthwhile, the gain + * is not enormous. 
+ */ + fetch_pixel_inc(wp, ws); + for (loop = 0;loop < (wp->save_xmag / wp->save_xdiv);loop++) + { + save_pixel(wp, ws); + save_pixel_inc(wp, ws); + SUB(R(r_xsize), R(r_xsize), S | IMM(1), "SUBS r_xsize,r_xsize,#1 ; count for each output pixel"); + branch(ws, B | EQ, L(loop_x_exitskip), "BEQ loop_x_exitskip"); + } + branch(ws, B , L(loop_x_repeat), "B loop_x_repeat ; discard this pixel"); +#endif + } + else + { + /* >>> There's not all that much point in this being separate from the odither case - could really + * abandon this one and use the ditering one all the time, with tiny variants. Not done. + */ + SUB_A(r_xcount, wp->save_xadd) + DEFINE_LABEL( loop_x_repeat, "Loop around for each source pixel") + ADD_A(r_xcount, wp->save_xadd) + mask_possible = fetch_pixel(wp, ws, &ws->labels.l_masked); + translate_pixel(wp, ws); /* If we're about the discard the pixel this is in fact wasted work - we could reorganise + * this whole loop to improve that situation, but it doesn't really seem worthwhile, the gain + * is not enormous. + */ + fetch_pixel_inc(wp, ws); + DEFINE_LABEL(loop_put_pixel_repeat, "Loop around to repeatedly paint a source pixel"); + SUBS_A(r_xcount, wp->save_xdiv) + branch(ws, B | MI, L(loop_x_repeat), "BMI loop_x_repeat ; discard this pixel"); + save_pixel(wp, ws); + save_pixel_inc(wp, ws); + SUB(R(r_xsize), R(r_xsize), S | IMM(1), "SUBS r_xsize,r_xsize,#1 ; count for each output pixel"); + branch(ws, B | NE, L(loop_put_pixel_repeat), "BNE loop_put_pixel_repeat"); + branch(ws, B, L(loop_x_exit), "B loop_x_exit ; skip code for masked pixels");/* moved from next if (GPS) */ + } + } + if (mask_possible) + { + DEFINE_LABEL(l_masked, "This source pixel masked out") + if (!PLOTMASK && (wp->save_xmag % wp->save_xdiv) == 0 && ((wp->save_xmag / wp->save_xdiv) > 4) && ws->gcol == 0) + { +#if 1 + fetch_pixel_inc(wp, ws); + + comment(ws, "calculating number of times to plot pixel"); + MOV(R(r_temp1), OP2R(R(r_xsize)), "@MOV r_xtemp1, r_xsize ; store r_xsize"); + SUB(R(r_xsize), R(r_xsize), S | OP2R(R(r_xcount)), "@SUBS r_xsize, r_xsize, r_xcount ; count output pixels"); + MOV(R(r_xsize), MI | IMM(0), "@MOVMI r_xsize, #0 "); + MOV(R(r_xcount), MI | OP2R(R(r_temp1)), "@MOVMI r_xcount, r_temp1 "); + + if (!DEST_32_BIT) + { + MOV(R(r_temp1), S | OP2R(R(r_outshift)) | LSRI(27), "@@MOVS r_temp1, r_outshift, LSR #27"); + MOV(R(r_temp1), EQ | IMM(32), "@@MOVEQ r_temp1, #32 ; 0 in r_outshift => 32 bits left"); + if (!DEST_1_BIT) + MOV(R(r_temp1), OP2R(R(r_temp1)) | LSRI(ws->out_l2bpc), "@@MOV r_temp1, r_temp1, LSR #log2bpc"); + CMP(R(r_xcount), OP2R(R(r_temp1)), "@@CMP r_xcount, r_temp1"); + branch(ws, B + LT, L(loop1), "@@BLT loop1 ; end of this masked input pixel"); + } + + skip_current_output_words(wp, ws); + + if (DEST_32_BIT) + { + branch(ws, B, L(loop_x_repeat), "1@B loop_x_repeat ; end of this masked input pixel"); + } + else + { + TEQ(R(r_xcount), IMM(0), "1@TEQ r_xcount, #0"); + branch(ws, B + EQ, L(loop_x_repeat), "1@BEQ loop_x_repeat ; end of this masked input pixel"); + DEFINE_LABEL(loop1, "Last word to skip") + skip_some_pixels(wp, ws); + + branch(ws, B, L(loop_x_repeat), "1@@B loop_x_repeat ; end of this masked input pixel"); + } +#else + int loop; + + fetch_pixel_inc(wp, ws); + for (loop = 0;loop < (wp->save_xmag / wp->save_xdiv);loop++) + { + save_pixel_inc(wp, ws); + SUB(R(r_xsize), R(r_xsize), S | IMM(1), "SUBS r_xsize,r_xsize,#1 ; count output pixels"); + branch(ws, B | EQ, L(loop_x_exitskip), "BEQ loop_x_exitskip"); + } + branch(ws, B, L(loop_x_repeat), "B loop_x_repeat ; end of 
this masked input pixel"); +#endif + } + else + { + fetch_pixel_inc(wp, ws); + DEFINE_LABEL(loop_put_masked_repeat, "Loop around to skip over dest pixels"); + SUBS_A(r_xcount, wp->save_xdiv) + branch(ws, B | MI, L(loop_x_repeat), "BMI loop_x_repeat ; end of this masked input pixel"); + save_pixel_inc(wp, ws); + SUB(R(r_xsize), R(r_xsize), S | IMM(1), "SUBS r_xsize,r_xsize,#1 ; count output pixels"); + branch(ws, B | NE, L(loop_put_masked_repeat), "BNE loop_put_masked_repeat"); + } + } + } + DEFINE_LABEL( loop_x_exit, "End of input pixel line") + newline(); + + if (!DEST_32_BIT) + { + comment(ws, "End of x loop - ensure any contents of r_outword are written out."); + MOV(R(r_outshift), S | OP2R(R(r_outshift)) | LSRI(27), "MOVS r_outshift,r_outshift,LSR #27 ; get real output shift distance"); + MOV(R(r_outshift), EQ | IMM(32), "MOVEQ r_outshift,#32 " + "; number of useful new bits in r_outword"); + if (ws->gcol == 0 && !SOURCE_MASK) + { + /* If setting pixels we must pick up the word we're about to + * partially overwrite, and combine the new and old pixels. + */ + comment(ws, "The top 32-r_outshift bits of r_outword are new pixels."); + MOV(R(r_outword), OP2R(R(r_outword)) | LSRR(R(r_outshift)),"MOV r_outword,r_outword,LSR r_outshift ; get new pixels in correct place"); + ins(ws, LDR(R(r_pixel), R(r_outptr)) | OFFSET(0), "LDR r_pixel,[r_outptr] ; temporary use of r_pixel"); + RSB(R(r_outshift), R(r_outshift), IMM(32), "RSB r_outshift,r_outshift,#32"); + MOV(R(r_pixel), OP2R(R(r_pixel)) | LSRR(R(r_outshift)), "MOV r_pixel,r_pixel,LSR r_outshift ; shift to clear out old pixels"); + ORR(R(r_outword), R(r_outword), + OP2R(R(r_pixel)) | LSLR(R(r_outshift)), "ORR r_outword,r_outword,r_pixel, LSL r_outshift ; combine old and new"); + ins(ws, STR(R(r_outword), R(r_outptr)) | OFFSET(0), "STR r_outword,[r_outptr] ; store updated word"); + } + else + { + MOV(R(r_outword), OP2R(R(r_outword)) | RORR(R(r_outshift)),"MOV r_outword,r_outword,ROR r_outshift"); + ins(ws, STR(R(r_outword), R(r_outptr)) | OFFSET(0), "STR r_outword,[r_outptr]"); + } + } + } +} + +/************************************************************************** +* * +* Bitblit: Overall construction of the Y loop. * +* * +**************************************************************************/ + +static void loop_y(asm_workspace *wp, workspace *ws, j_decompress_ptr cinfo) +/* Overall control of the code and outer loop */ +{ + /* Declare the registers */ + int yrn; + int x_loop_save_mask; + int y_loop_save_mask; + int ptrs_save_mask; /* r_inptr, r_outptr, and (if it exists) r_maskinptr */ + int x_loop_save_size; + int ptrs_save_size; + BOOL onebank; /* one bank of registers, or two */ +#ifdef DEBUG + char xregs[256]; + char yregs[256]; + char ptrregs[256]; + char a[256]; +#endif + + /* Various useful constants not provided directly by wp. 
*/ + newline(); + comment(ws, "Various useful constants"); + if (DPIXEL_INPUT) + comment(ws, "Double-pixel input - pixels are not the same as double-pixels"); + else + comment(ws, "Not double-pixel input - pixels are exactly the same as double-pixels"); + ws->in_bpp = 1 << wp->save_inlog2bpp; + ws->in_bpc = 1 << wp->save_inlog2bpc; + ws->in_pixmask = (1 << ws->in_bpp) - 1; + tracef("%t20.in_bpp * %i %t68; bits per input pixel\n" _ ws->in_bpp); + tracef("%t20.in_bpc * %i %t68; bits per input double-pixel ('character')\n" _ ws->in_bpc); + if (ws->in_bpp <= 8) tracef("%t20.in_pixmask * %i %t68; input pixel mask\n" _ ws->in_pixmask); + + if (SOURCE_MASK) + { + if (SOURCE_BPPMASK) /* a bit mask */ + { + ws->mask_bpp = 1; + ws->mask_bpc = 1; + ws->mask_pixmask = 1; + } + else + { + ws->mask_bpp = ws->in_bpp; + ws->mask_bpc = ws->in_bpc; + ws->mask_pixmask = ws->in_pixmask; + } + tracef("%t20.mask_bpp * %i %t68; bits per mask pixel\n" _ ws->mask_bpp); + tracef("%t20.mask_bpc * %i %t68; bits per mask double-pixel\n" _ ws->mask_bpc); + tracef("%t20.mask_pixmask * %i %t68; mask pixel mask\n" _ ws->mask_pixmask); + } + else + comment(ws, "No input mask"); + + if (DPIXEL_OUTPUT) + comment(ws, "Double-pixel output - pixels are not the same as double-pixels"); + else + comment(ws, "Not double-pixel output - pixels are exactly the same as double-pixels"); + ws->out_l2ppw = 5 - ws->out_l2bpc; + ws->out_ppw = 1 << ws->out_l2ppw; + ws->out_pixmask = (1 << wp->BPP) - 1; + ws->out_dpixmask = (1 << wp->BPC) - 1; + tracef("%t20.out_bpp * %i %t68; bits per output pixel\n" _ wp->BPP); + tracef("%t20.out_bpc * %i %t68; bits per output double-pixel\n" _ wp->BPC); + tracef("%t20.out_l2bpp * %i %t68; log base 2 of bits per output pixel\n" _ ws->out_l2bpp); + tracef("%t20.out_l2bpc * %i %t68; log base 2 of bits per output double-pixel\n" _ ws->out_l2bpc); + tracef("%t20.out_ppw * %i %t68; double-pixels per output word\n" _ ws->out_ppw); + tracef("%t20.out_l2ppw * %i %t68; log base 2 of double-pixels per output word\n" _ ws->out_l2ppw); + if (wp->BPC <= 8) + { + tracef("%t20.out_pixmask * %i %t68; output pixel mask\n" _ ws->out_pixmask); + tracef("%t20.out_dpixmask * %i %t68; output double-pixel mask\n" _ ws->out_dpixmask); + } + + /* Setting up ordered dither, if required */ + if ( !PLOTMASK /* if plotting sprite */ + && ws->in_bpp >= 16 /* from true colour source */ + && wp->BPP < ws->in_bpp /* and losing resolution */ + && (wp->dither_truecolour & 1) + && !(wp->is_it_jpeg && (wp->dither_truecolour & 2)) + ) + { + tracef("in dither_truecolour = %x\n" _ wp->dither_truecolour); + comment(ws, "Ordered dither being used"); + /* If not 0 then ws->odither is the number of bits - 1 being truncated from 8-bit source colour values */ + if (wp->BPP == 16) /* dithering down from 32 bit to 16 bit */ + ws->odither = 2; + else /* dithering down from 16 or 32 bit, to 1/2/4/8 bit. */ + { + if (ws->out_l2bpp == 3) /* 8bpp */ + { + if (wp->is_it_jpeg && cinfo->jpeg_color_space == JCS_GRAYSCALE) + ws->odither = 3; /* dither assuming 4 bits of grey represented */ + else + ws->odither = 4; /* seems to work better for colour than 3, which is what you might expect if + * you were assuming 4 bits of colour per gun. In other words, the tint is NOT + * effective enough at representing the next two bits of colour output! + * If the source is known to be greyscale then 3 is a better value. 
+ */ + } + else + ws->odither = 6 - ws->out_l2bpp; /* 6, 5 or 4 for 2, 4, or 16 colour output (2, 4 or 8 grey level) */ + } + tracef("%t20.odither_eorvalue * 1:SHL:(24+%i) %t68; value to EOR into r_oditheradd each pixel" _ ws->odither); + } + tracef("out dither_truecolour = %x\n" _ wp->dither_truecolour); + + newline(); + ins(ws, PUSH | 0x5fff, "STMDB sp!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} ; save entry registers"); + newline(); + + comment(ws, "Register declarations"); + if (wp->is_it_jpeg) + { + ws->leave_r12_alone = TRUE; + comment(ws, "Leave r12 unallocated, it contains the assembler module workspace pointer"); + } + ptrs_rn(wp, ws); + ptrs_save_mask = (1<<ws->next_free_reg) - 1; + IFDEBUG(ldm_reg_list(ws, ptrregs, ptrs_save_mask, FALSE);) + ptrs_save_size = SOURCE_BPPMASK || PLOTMASK ? 12 : 8; + if (ws->odither) ptrs_save_size += 4; + + xloop_rn(wp, ws); + x_loop_save_mask = ((1<<ws->next_free_reg) - 1) & ~ptrs_save_mask; + x_loop_save_size = 4 * ws->next_free_reg - ptrs_save_size; /* size in bytes, used right at the end */ + /* Of the x-loop variables, no need to save r_inword/outword/maskinword/temp1/temp2 - set up every time round */ + if (ws->regnames.r_inword.regno != -1) {x_loop_save_mask &= ~(1 << ws->regnames.r_inword.regno); x_loop_save_size -= 4;} + if (ws->regnames.r_outword.regno != -1) {x_loop_save_mask &= ~(1 << ws->regnames.r_outword.regno); x_loop_save_size -= 4;} + if (ws->regnames.r_maskinword.regno != -1) {x_loop_save_mask &= ~(1 << ws->regnames.r_maskinword.regno); x_loop_save_size -= 4;} + if (ws->regnames.r_temp1.regno != -1) {x_loop_save_mask &= ~(1 << ws->regnames.r_temp1.regno); x_loop_save_size -= 4;} + if (ws->regnames.r_temp2.regno != -1) {x_loop_save_mask &= ~(1 << ws->regnames.r_temp2.regno); x_loop_save_size -= 4;} + IFDEBUG(ldm_reg_list(ws, xregs, x_loop_save_mask, FALSE);) + + yrn = yloop_rn_count(wp, ws); + onebank = yrn + ws->next_free_reg + ws->leave_r12_alone <= 13; + comment(ws, onebank ? "The y loop variables will fit in registers too" + : "The y loop variables are overlaid on the x ones"); + if (!onebank) ws->next_free_reg = 4; /* Overlay the x-loop register allocations - but not ptr registers */ + yloop_rn(wp, ws); + if (!onebank) /* If two banks, be prepared to do LDM/STM for the y-loop bank */ + { + y_loop_save_mask = ((1<<ws->next_free_reg) - 1) & 0xfffffff0; /* not regs 0..3 */ + IFDEBUG(ldm_reg_list(ws, yregs, y_loop_save_mask, TRUE);) + } + newline(); + + comment(ws, "Load up initial values of x-loop variables"); + fetch_pixel_init(wp, ws); + save_pixel_init(wp, ws); + xloop_init(wp, ws); + + tracef("%t20.x_loop_save_size * %t28%i %t68.; Bytes of stack for x-loop variables\n" _ x_loop_save_size); + tracef("%t20.ptrs_save_size * %t28%i %t68.; Bytes of stack for ptr variables\n" _ ptrs_save_size); + comment(ws, "Save x-loop and pointer variables on the stack"); + IFDEBUG(do_sprintf(a, "STMDB sp!,{%s,%s}", ptrregs,xregs);) + /* Added by (GPS) to get round spilled reg bug. 
*/ + if(ws->odither && SOURCE_16_BIT) + { + tracef("x_loop_save_mask = %x\n" _ x_loop_save_mask); + x_loop_save_mask &= ~(1<<(ws->regnames.r_xcount.regno)); + x_loop_save_mask |= (1<<(ws->regnames.r_pixel.regno)); + tracef("x_loop_save_mask = %x\n" _ x_loop_save_mask); + ins(ws, PUSH | x_loop_save_mask | ptrs_save_mask, a); + x_loop_save_mask |= (1<<(ws->regnames.r_xcount.regno)); + x_loop_save_mask &= ~(1<<(ws->regnames.r_pixel.regno)); + comment(ws, "r_pixel pushed instead of x-count"); +#ifdef DEBUG + tracef("x_loop_save_mask = %x\n" _ x_loop_save_mask); +#endif + } + else + { + ins(ws, PUSH | x_loop_save_mask | ptrs_save_mask, a); + } + /* end added code...*/ + + newline(); + + comment(ws, "Load up initial values of y-loop variables"); + yloop_init(wp, ws); + + if(ws->odither && SOURCE_16_BIT) + { + MOV(R(r_xcount), OP2R(R(r_pixel)), "MOV r_xcount,r_pixel ; set r_xcount to correct value"); + } + + if (!simple_y_scale(wp, ws)) /* If not simple scaling, might not paint the first row */ + branch(ws, B, L(y_loop_enter), "B y_loop_enter ; enter the main loop"); + + /* Top of the y-loop */ + newline(); + DEFINE_LABEL(y_loop, "Loop around for each row") + + if (!simple_y_scale(wp, ws)) + { + comment(ws, "At this point the ptr registers have been updated but not saved"); + IFDEBUG(do_sprintf(a, "STMIA sp,{%s}", ptrregs);) + ins(ws, STMIA(R(sp)) | ptrs_save_mask, a); + } + + if (wp->is_it_jpeg) + { + comment(ws, "r_inptr is the source y coord for JPEG data: convert to data pointer"); + comment(ws, "fetchroutine uses r_inptr(=r0), r12. On output r_inptr=source result pointer"); + MOV(R(lr), OP2R(R(pc)), "MOV lr,pc ; remember return address from fetchroutine"); + MOV(R(pc), OP2R(R(r_fetchroutine)), "MOV pc,r_fetchroutine ; get source address"); + LDR_WP_C(lr, in_x, "returned value is for base of line - add initial offset") + if (wp->save_inlog2bpp < 5) + { + if (wp->save_inlog2bpp == 3) + ADD(R(r_inptr),R(r_inptr),OP2R(R(lr)), "ADD r_inptr,r_inptr,lr ; add in_x as byte offset"); + else + ADD(R(r_inptr),R(r_inptr),OP2R(R(lr)) | LSLI(1), "ADD r_inptr,r_inptr,lr,LSL#1 ; add in_x as halfword offset"); + BIC(R(r_inptr),R(r_inptr),IMM(3), "BIC r_inptr,r_inptr,#3 ; r_inptr is a word pointer"); + } + else + ADD(R(r_inptr),R(r_inptr),OP2R(R(lr)) | LSLI(2), "ADD r_inptr,r_inptr,lr,LSL#2 ; add in_x as word offset"); + } + + if (!onebank) + { + /* the x-loop variables are already set up, with inptr/outptr/maskinptr saved at new values */ + ADD(R(lr), R(sp), IMM(ptrs_save_size), "ADD lr,sp,#ptrs_save_size"); + IFDEBUG(do_sprintf(a, "STMDB sp!,{%s} %t40; push y-loop variables", yregs);) + ins(ws, PUSH | y_loop_save_mask, a); + IFDEBUG(do_sprintf(a, "LDMIA lr,{%s} %t40; load x-loop variables", xregs);) + ins(ws, LDMIA(R(lr)) | x_loop_save_mask, a); /* Reload the x-loop variables */ + } + newline(); + + /* Generate the inner loop. */ + loop_x(wp, ws); + + /* Suitable register 'bank' swapping. 
*/ + if (onebank) + { + IFDEBUG(do_sprintf(a, "LDMIA sp,{%s,%s} %t40; reload x-loop and ptr registers", ptrregs, xregs);) + ins(ws, LDMIA(R(sp)) | x_loop_save_mask | ptrs_save_mask, a); + } + else + { + IFDEBUG(do_sprintf(a, "LDMIA sp!,{%s} %t40; pop y-loop variables", yregs);) + ins(ws, POP | y_loop_save_mask, a); + newline(); + comment(ws, "Reload pointers to the start of a row"); + IFDEBUG(do_sprintf(a, "LDMIA sp,{%s} %t40; reload ptr registers", ptrregs);) + ins(ws, LDMIA(R(sp)) | ptrs_save_mask, a); + } + + /* Control of scaling in the y direction */ + if (simple_y_scale(wp, ws)) + { + comment(ws, "1:1 scaling in y direction - each source row appears once"); + if (!PLOTMASK) + { + if (wp->is_it_jpeg) + ADD(R(r_inptr), R(r_inptr), IMM(1), "ADD r_inptr,r_inptr,#1 ; inc y coord of input JPEG data"); + else + SUB(R(r_inptr), R(r_inptr), OP2R(R(r_inoffset)), "SUB r_inptr,r_inptr,r_inoffset"); + } + SUB_A(r_outptr,wp->save_outoffset) /*SUB r_outptr,r_outptr,#outoffset*/ + odither_inc(wp, ws, 1); /* advance to next coord */ + odither_inc(wp, ws, 0); /* ensure X coord phase alternates on alternate lines */ + if (SOURCE_BPPMASK || PLOTMASK) + SUB(R(r_maskinptr), R(r_maskinptr), + OP2R(R(r_maskinoffset)), "SUB r_maskinptr,r_maskinptr,r_maskinoffset"); + IFDEBUG(do_sprintf(a, "STMIA sp,{%s} %t40.; Save updated ptr registers", ptrregs);) + ins(ws, STMIA(R(sp)) | ptrs_save_mask, a); + SUB(R(r_ysize), R(r_ysize), S | IMM(1), "SUBS r_ysize,r_ysize,#1 ; decrement output pixel size"); + branch(ws, B | GT, L(y_loop), "BGT y_loop"); + } + else + { + SUB(R(r_ysize), R(r_ysize), S | IMM(1), "SUBS r_ysize,r_ysize,#1"); + branch(ws, B | LE, L(y_loop_exit), "BLE y_loop_exit"); + SUB_A(r_outptr,wp->save_outoffset) /*SUB r_outptr,r_outptr,#outoffset*/ + odither_inc(wp, ws, 1); + odither_inc(wp, ws, 0); + + if (PLOTMASK) + { + comment(ws, "Advance ECF pointer"); + LDR_WP(r_pixel, save_ecflimit); /*LDR r_pixel,save_ecflimit*/ + CMP(R(r_inptr), OP2R(R(r_pixel)), "CMP r_inptr,r_pixel ; check for bottom of ECF"); + ADD(R(r_inptr), R(r_inptr), EQ | IMM(64), "ADDEQ r_inptr,r_inptr,#64 ; and if reached, reset to top"); + SUB(R(r_inptr), R(r_inptr), IMM(8), "SUB r_inptr,r_inptr,#8 ; points to base of current row of ECF"); + } + comment(ws, "Control of scaling in y direction"); + DEFINE_LABEL( y_loop_enter, "Initial entry into the loop") + SUBS_A(r_ycount, wp->save_ydiv) /*SUBS r_ycount,r_ycount,#ydiv*/ + branch(ws, B | PL, L(y_loop), "BPL y_loop ; if count>=0 then B else next source row"); + if (!PLOTMASK) + { + if (wp->is_it_jpeg) + ADD(R(r_inptr), R(r_inptr), IMM(1), "ADD r_inptr,r_inptr,#1 ; inc y coord of source JPEG data"); + else + SUB(R(r_inptr), R(r_inptr), OP2R(R(r_inoffset)), "SUB r_inptr,r_inptr,r_inoffset ; next source row"); + } + if (SOURCE_BPPMASK || PLOTMASK) + SUB(R(r_maskinptr), R(r_maskinptr), + OP2R(R(r_maskinoffset)), "SUB r_maskinptr,r_maskinptr,r_maskinoffset ; advance input mask pointer"); + ADD_A(r_ycount, wp->save_ydiv + wp->save_yadd) /*ADD r_ycount,r_ycount,#(ydiv+yadd)*/ + branch(ws, B, L(y_loop_enter), "B y_loop_enter ; reenter the main loop"); + DEFINE_LABEL(y_loop_exit, "Exit from y loop") + } + newline(); + comment(ws, "Discard workspace, restore registers, and exit"); + + ADD(R(sp), R(sp), IMM(x_loop_save_size+ptrs_save_size), "ADD sp,sp,#x_loop_save_size+ptrs_save_size ; discard saved x-loop variables"); + + ins(ws, POP | 0x5fff, "LDMIA sp!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} ; restore, exit"); + + MOV(R(pc), OP2R(R(lr)), "MOV pc, lr"); +} + 
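+/* When the y scaling is not 1:1, the code emitted above is a DDA-style
+ * accumulator: r_ycount loses ydiv for every output row, and the source row
+ * pointer only advances (gaining ydiv+yadd back) once the count goes
+ * negative, so each source row is painted zero or more times as the ratio
+ * requires.  The function below is a minimal C sketch of that control flow;
+ * sketch_y_scale, paint_row, next_source_row and ctx are hypothetical names
+ * used for illustration only and are not referenced by the compiler above.
+ */
+static void sketch_y_scale(int ysize, int ycount, int yadd, int ydiv,
+                           void (*paint_row)(void *ctx),
+                           void (*next_source_row)(void *ctx),
+                           void *ctx)
+{
+  for (;;)
+  {
+    ycount -= ydiv;              /* SUBS r_ycount,r_ycount,#ydiv (y_loop_enter) */
+    if (ycount >= 0)             /* BPL y_loop - paint from the current source row */
+    {
+      paint_row(ctx);            /* one pass of the generated loop_x() code */
+      if (--ysize <= 0)          /* SUBS r_ysize,r_ysize,#1 / BLE y_loop_exit */
+        return;
+    }
+    else
+    {
+      next_source_row(ctx);      /* SUB r_inptr,r_inptr,r_inoffset */
+      ycount += ydiv + yadd;     /* ADD r_ycount,r_ycount,#(ydiv+yadd) */
+    }
+  }
+}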
+/************************************************************************** +* * +* Bitblit: The main compiler entry points. * +* * +**************************************************************************/ + +static blitter find_or_compile_code(asm_workspace *wp, workspace *ws, j_decompress_ptr cinfo) +/* Based on the workspace variables look through existing compiled buffers for an existing match */ +{ + code_buffer *p; + int key_word; + + key_word = wp->save_inlog2bpp /* 0..2 */ + + (ws->out_l2bpp << 3) /* 3..5 */ + + (ws->gcol << 6); /* 6..8 */ + if (SOURCE_MASK) key_word |= 1<<9; + if (SOURCE_BPPMASK) key_word |= 1<<10; + if (wp->trns_palette != 0) key_word |= 1<<11; + if (wp->ColourTTR != 0) key_word |= 1<<12; + if (wp->BPP != wp->BPC) key_word |= 1<<13; + if (wp->save_inlog2bpp != wp->save_inlog2bpc) key_word |= 1<<14; + if (PLOTMASK) key_word |= 1<<15; +#ifdef ASMjpeg + if (wp->is_it_jpeg) key_word |= 1<<16; + if (wp->is_it_jpeg && cinfo->jpeg_color_space == JCS_GRAYSCALE) key_word |= 1<<17; +#endif + if (wp->cal_table) + { + key_word |= 1<<18; + if (ws->cal_table_simple) key_word |= 1<<19; + if (wp->cal_table->tablecount == 3) key_word |= 1<<20; + } +#ifdef ASMjpeg + if (wp->is_it_jpeg && (wp->dither_truecolour & 1)) key_word |= 1<<21; + if (wp->is_it_jpeg && (wp->dither_truecolour & 2)) key_word |= 1<<22; +#endif + tracef("Searching for compiled code for key_word=%x, scale=%i:%i,%i:%i outoffset=%x.\n" _ + key_word _ wp->save_xadd - wp->save_xdiv _ wp->save_xdiv _ wp->save_yadd _ wp->save_ydiv _ wp->save_outoffset); + tracef("simple_x_scale=%s x_block_move=%s jpeg=%s calibration table=0x%x\n" + _ whether(simple_x_scale(wp, ws)) + _ whether(x_block_move(wp, ws)) + _ whether(wp->is_it_jpeg) + _ wp->cal_table); + FOR_EACH_BUFFER(p) + if ( p->key_word == key_word + && p->xadd == wp->save_xadd + && p->xdiv == wp->save_xdiv + && p->yadd == wp->save_yadd + && p->ydiv == wp->save_ydiv + && p->outoffset == wp->save_outoffset + ) + { + tracef("Found existing compiled code in buffer %x.\n" _ p); + + return (blitter)p->code; + } + p = &ws->buffers[ws->build_buffer]; + p->key_word = -1; /* Not set unless we complete the compilation - see below */ + p->xadd = wp->save_xadd; + p->xdiv = wp->save_xdiv; + p->yadd = wp->save_yadd; + p->ydiv = wp->save_ydiv; + p->outoffset = wp->save_outoffset; + tracef("Compiler initialised for buffer at %x.\n" _ p); + compile_buffer_init(wp, ws); + + /* Now we actually do the compile */ + loop_y(wp, ws, cinfo); + + compile_buffer_done(ws); + p->key_word = key_word; + + /* Just did some dynamic code generation so flush the I cache */ + _swix(OS_SynchroniseCodeAreas, _IN(0) | _IN(1) | _IN(2), 1, + (int)ws->compile_base, (int)ws->compile_base + ((BUFSIZE - 1 /* Inclusive */) * sizeof(int))); + + return (blitter)ws->compile_base; +} + +blitter putscaled_compiler(asm_workspace *wp, workspace *ws, workspace *ws_end, int gcol) +/* Main entrypoint from the assembler */ +{ + j_decompress_ptr cinfo = NULL; + int i, j; + blitter result; + + /* Check that the assembler has an adequate opinion of our workspace needs. 
*/ + tracef("wp=%x ws=%x ws_end=%x.\n" _ wp _ ws _ ws_end); + tracef("Size of assembler workspace: %i.\n" _ ((char*)ws) - ((char*)wp)); + tracef("Size of C workspace: %i.\n" _ ((char*)ws_end) - ((char*)ws)); + assert(ws_end > ws, ERROR_FATAL); + check_workspace(ws); + IFDEBUG(dump_asm_workspace(wp);) + + ws->gcol = gcol & 7; + ws->masked = (gcol & 8) != 0;/* || PLOTMASK;*/ + ws->mask1bpp = ws->masked & (((wp->save_mode) >> 27) != SpriteType_Old); + ws->odither = FALSE; /* Set more carefully later. */ + tracef("gcol=%i (& 7 = %i) %t32. GCOL action - 0 for plot, 1..7 for various others.\n" _ gcol _ gcol & 7); + tracef("masked=%s %t32. whether to use mask.\n" _ whether(ws->masked)); + tracef("1bpp mask=%s %t32. whether mask is new format.\n" _ whether(ws->mask1bpp)); + +#ifdef ASMjpeg + if (wp->is_it_jpeg) + { + sprite_header *s = wp->save_sprite; + int *compress_id_word = (int*)((char*) s + s->image); /* The first word of the sprite data */ + char *jpeg_data; + int jpeg_data_size, jpeg_ws_size; + int opt, err, xmax; + + assert(compress_id_word[0] == -1, ERROR_BAD_JPEG); + tracef("This JPEG sprite was constructed by PutJPEGScaled\n"); + jpeg_data = (char*)compress_id_word[1]; + jpeg_data_size = compress_id_word[2]; + jpeg_ws_size = compress_id_word[3]; + check_jpeg_workspace(wp, jpeg_ws_size); + cinfo = wp->jpeg_info_ptr; + + assert(wp->save_inlog2bpp == 5, ERROR_FATAL); /* 32bpp source */ + assert(!SOURCE_MASK, ERROR_FATAL); /* no mask */ + tracef("JPEG, initial source coords are %i,%i.\n" _ wp->in_x _ wp->in_y); + if ((wp->save_mode >> 27) == 0) + { + /* Old-style mode - make sure no translation table present. */ + wp->ColourTTR = 0; /* >>>> mainly for JPEG on RO3 */ + wp->trns_palette = 0; /* >>>> mainly for JPEG on RO3 */ + } + + /* Deduce the decompression options */ + opt = jpeg_decompressor_opts(cinfo, wp); + + /* Reverse scaling calculation */ + xmax = wp->in_x + 2 + (wp->save_xsize * wp->save_xdiv) / (wp->save_xadd - wp->save_xdiv); + if (xmax < 0) xmax = s->width; /* set safe xmax if reverse scale calculation overflowed */ + + /* Initialise the decompressor */ + err = jpeg_scan_file(cinfo, jpeg_data, jpeg_data_size, wp->in_x, xmax, -1, -1, opt); + assert(err == 0, ERROR_BAD_JPEG); + + /* Check the decompressor agreed with proposed output options */ + if (cinfo->error_argument1 & (jopt_OUTBPP_8 | jopt_OUTBPP_8YUV | jopt_OUTBPP_8GREY)) /* we asked for it, and we got it - 8bpp output pixels */ + { + tracef("actually doing new shiny 8BPP plotting technique\n"); + wp->save_inlog2bpp = wp->save_inlog2bpc = 3; + wp->ColourTTR = 0; + } + else + { + if (cinfo->error_argument1 & jopt_OUTBPP_16) /* we asked for it, and we got it - 16bpp output pixels */ + wp->save_inlog2bpp = wp->save_inlog2bpc = 4; + } + } +#endif + +#ifdef DEBUG + /* Additional mask tracing */ + if (PLOTMASK) + { + char *p; + int *ecf = (int*) wp->save_ecflimit; + + tracef("Sprite data:\n"); + p = (char*) wp->save_inptr; + for (i = 0; i < 16; i++) + { + tracef("%x" _ p); + for (j = 0; j < 16; j++) tracef(" %2x" _ p[j]); + newline(); + p -= wp->save_inoffset; /* convert from byte offset to int offset */ + } + + tracef("Mask data:\n"); + p = (char*) (SOURCE_BPPMASK ? wp->save_maskinptr : (int) wp->save_inptr + wp->save_masko); + for (i = 0; i < 16; i++) + { + tracef("%x" _ p); + for (j = 0; j < 16; j++) tracef(" %2x" _ p[j]); + newline(); + p -= wp->save_inoffset; + } + + tracef("ECF pattern:\n"); + for (i = 0; i <= 8; i++) + tracef("%x: %c %x %x\n" _ ecf + 2*i _ (ecf+2*i == (int*)wp->save_ecfptr ? 
'>' : ' ') _ ecf[2*i] _ ecf[2*i + 1]); + } +#endif + + if (wp->cal_table) + { + calibration_table *t = wp->cal_table; + + ws->cal_table_simple = t->idealblack == 0 && t->idealwhite == 0xffffff00 && t->postprocessSWI == 0; +#ifdef DEBUG + tracef("Calibration table at 0x%x: version=%i idealblack=0x%x idealwhite=0x%x postprocessSWI=0x%x tablecount=%i simple=%s.\n" + _ t->version _ t->idealblack _ t->idealwhite _ t->postprocessSWI _ t->tablecount _ whether(ws->cal_table_simple)); + for (i = 0; i < 256; i++) tracef(" %i" _ t->redtable[i]); newline(); + if (t->tablecount == 3) for (i = 0; i < 256; i++) tracef(" %i" _ t->greentable[i]); newline(); + if (t->tablecount == 3) for (i = 0; i < 256; i++) tracef(" %i" _ t->bluetable[i]); newline(); +#endif + assert(wp->BPP == 32, ERROR_FATAL); /* only to 32 bit dest */ + assert(wp->save_inlog2bpp >= 4, ERROR_FATAL); /* only from 16 or 32 bit source */ + assert(!SOURCE_TABLE, ERROR_FATAL); /* there isn't room for a calibration table and another table - they share r_table */ + assert(t->version == 0, ERROR_FATAL); /* check version number of lookup table */ + } + + /* Compute l2bpp from BPP of output - all we're given. */ + i = 0; + j = wp->BPP; + while (j > 1) + { + j = j >> 1; i++; + } + ws->out_l2bpp = i; + if (wp->BPP != wp->BPC) i++; + ws->out_l2bpc = i; + + /* If using a palette, ignore any translation table */ + if (wp->trns_palette != 0) wp->ColourTTR = 0; + + /* Simplify scale factors - >>> is this useful? Helps spot 1:1 scaling I guess? */ + assert(wp->save_xadd > 0, ERROR_FATAL); + assert(wp->save_xdiv > 0, ERROR_FATAL); + assert(wp->save_ydiv > 0, ERROR_FATAL); + assert(wp->save_ydiv > 0, ERROR_FATAL); + while ((wp->save_xadd & 1) == 0 && + (wp->save_xdiv & 1) == 0 && + (wp->save_xcount & 1) == 0 && + (wp->save_xmag & 1) == 0) + { + wp->save_xadd >>= 1; wp->save_xdiv >>= 1; + wp->save_xcount >>= 1; wp->save_xmag >>=1; + } + while ((wp->save_yadd & 1) == 0 && + (wp->save_ydiv & 1) == 0 && + (wp->save_ycount & 1) == 0) + { + wp->save_yadd >>= 1; wp->save_ydiv >>= 1; + wp->save_ycount >>= 1; + } + + /* Look for unit translation table */ +#ifdef DEBUG + if (wp->ColourTTR != 0 && wp->BPP == (1<<wp->save_inlog2bpp)) /* only if table, and depth matches */ + { + char *t = (char*) wp->ColourTTR; + BOOL same = TRUE; + int size = 1 << (1 << (wp->save_inlog2bpp == 5 ? 4 : wp->save_inlog2bpp)); + + if (wp->save_xsize * wp->save_ysize > size) /* Unless huge table for tiny sprite */ + { + for (i = 0; i < size; i++) + if (t[i] != i) {same = FALSE; break;} + if (same) + { + tracef("Unit translation table - discarded\n"); + wp->ColourTTR = 0; + assert(0, ERROR_FATAL); /* These are now zapped by the assembler, so they shouldn't ever turn up. */ + } + } + } +#endif + + /* Precise handling of double-pixel modes by the surrounding code is still unclear to me! + * When it enters this code bpc!=bpp can still be the case, but it seems that the actual + * value of bpc is best ignored, it has all been frigged into the scale factors. Avoid + * this issue for now, but note that we must set the values back afterwards because they + * can be reused on the next sprite plot, if the source sprite mode word is the same. + */ + i = wp->BPC; + j = wp->save_inlog2bpc; + wp->BPC = wp->BPP; + wp->save_inlog2bpc = wp->save_inlog2bpp; + result = find_or_compile_code(wp, ws, cinfo); + wp->BPC = i; + wp->save_inlog2bpc = j; + + return result; +} +#endif
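+/* find_or_compile_code() treats the code buffers as a small cache keyed on the
+ * plot parameters: anything that changes the generated instructions is folded
+ * into key_word, with the scale factors and output offset compared separately.
+ * The sketch below shows that lookup-or-compile pattern in isolation; the
+ * sketch_* names, the plain array and the round-robin buffer choice are
+ * assumptions for illustration and do not reflect the module's real buffer
+ * management (FOR_EACH_BUFFER, build_buffer, compile_buffer_init/done).
+ */
+typedef struct sketch_code_buffer
+{
+  int key_word;                  /* -1 while the buffer holds no valid code */
+  int xadd, xdiv, yadd, ydiv;
+  int outoffset;
+  void *code;
+} sketch_code_buffer;
+
+static void *sketch_find_or_compile(sketch_code_buffer *bufs, int nbufs, int *next,
+                                    int key_word, int xadd, int xdiv, int yadd,
+                                    int ydiv, int outoffset,
+                                    void *(*compile)(sketch_code_buffer *buf))
+{
+  int i;
+  sketch_code_buffer *p;
+
+  /* Reuse previously generated code if every distinguishing parameter matches */
+  for (i = 0; i < nbufs; i++)
+  {
+    p = &bufs[i];
+    if (p->key_word == key_word && p->xadd == xadd && p->xdiv == xdiv
+        && p->yadd == yadd && p->ydiv == ydiv && p->outoffset == outoffset)
+      return p->code;
+  }
+
+  /* Otherwise compile into the next buffer.  The key stays invalid until the
+   * compilation completes; the real code then flushes the instruction cache
+   * (OS_SynchroniseCodeAreas) before the freshly generated code is executed.
+   */
+  p = &bufs[*next];
+  *next = (*next + 1) % nbufs;
+  p->key_word = -1;
+  p->xadd = xadd; p->xdiv = xdiv;
+  p->yadd = yadd; p->ydiv = ydiv;
+  p->outoffset = outoffset;
+  p->code = compile(p);
+  p->key_word = key_word;
+  return p->code;
+}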