diff --git a/Sources/jdhuff b/Sources/jdhuff
index dd0605f131478705582e86913b550cb5ea8a45a5..97a76e52bd44349359905e1502f32c0e70411b57 100644
--- a/Sources/jdhuff
+++ b/Sources/jdhuff
@@ -56,7 +56,7 @@ h_l        RN      2      ; used in huff_decode - same as temp3
         HUFF_DECODE_SETUP $tbl
         ; set up the specific table pointers for ac or dc huff table
-        ; $tbl is a HUFF_TBL*.
+        ; $tbl is a JHUFF_TBL*.
         ADD     h_maxcode,$tbl,#huff_tbl_maxcode
         ADD     h_shortcut,$tbl,#huff_tbl_shortcut
         ADD     h_huffval,$tbl,#huff_tbl_huffval
@@ -221,16 +221,15 @@ huff_decode_loop$lab
 ; --------------------------------------------------------------------
         EXPORT  asm_huff_decode_blocks
-;LOCAL void
-;huff_decode_blocks (decompress_info_ptr cinfo, JBLOCK block,
-;                    HUFF_TBL *dctbl, HUFF_TBL *actbl,
-;                    QUANT_TBL_PTR quanttbl, int *last_dc_val, int nblocks)
+;huff_decode_blocks (j_decompress_ptr cinfo, JBLOCK block,
+;                    JHUFF_TBL *dctbl, JHUFF_TBL *actbl,
+;                    JQUANT_TBL *quanttbl, JCOEF *last_dc_val, int nblocks)
 ;  r0 = cinfo
 ;  r1 = block pointer
-;  r2 = HUFF_TBL* dctbl
-;  r3 = HUFF_TBL* actbl
-;  [sp] = quanttbl
-;  [sp,#4] = int *last_dc_val
+;  r2 = JHUFF_TBL *dctbl
+;  r3 = JHUFF_TBL *actbl
+;  [sp,#0] = quanttbl
+;  [sp,#4] = JCOEF *last_dc_val
 ;  [sp,#8] = int nblocks
 ; save registers
@@ -283,7 +282,7 @@ huff_block_clear
 huff_anotherblock                                ; loop round to here nblocks times
 ; Set up huffman decoding for the DC component.
-        LDR     h_temp,[sp,#2*4]                 ; HUFF_TBL* dctbl    (DC table pointer)
+        LDR     h_temp,[sp,#2*4]                 ; JHUFF_TBL* dctbl    (DC table pointer)
         HUFF_DECODE_SETUP h_temp                 ; set maxcode,huffval,shortcut
 ; Handle the DC component
@@ -310,7 +309,7 @@ huff_dc_0
         ; That's the DC value done.
 ; Set up huffman decoding for the AC components.
-        LDR     h_temp,[sp,#3*4]                 ; HUFF_TBL* actbl    (AC table pointer)
+        LDR     h_temp,[sp,#3*4]                 ; JHUFF_TBL* actbl    (AC table pointer)
         HUFF_DECODE_SETUP h_temp                 ; set maxcode,huffval,shortcut
 ; The loop that does AC components, once round for each non-zero component.
@@ -380,16 +379,15 @@ huff_zag_end
 ; --------------------------------------------------------------------
         EXPORT  asm_huff_skip_blocks
-;LOCAL void
-;huff_skip_blocks (decompress_info_ptr cinfo, JBLOCK block,
-;                  HUFF_TBL *dctbl, HUFF_TBL *actbl,
-;                  QUANT_TBL_PTR quanttbl, int *last_dc_val, int nblocks)
+;huff_skip_blocks (j_decompress_ptr cinfo, JBLOCK block,
+;                  JHUFF_TBL *dctbl, JHUFF_TBL *actbl,
+;                  JQUANT_TBL *quanttbl, JCOEF *last_dc_val, int nblocks)
 ;  r0 = cinfo
 ;  r1 = block pointer             (UNUSED)
-;  r2 = HUFF_TBL* dctbl
-;  r3 = HUFF_TBL* actbl
-;  [sp] = quanttbl                (UNUSED)
-;  [sp,#4] = int *last_dc_val
+;  r2 = JHUFF_TBL *dctbl
+;  r3 = JHUFF_TBL *actbl
+;  [sp,#0] = quanttbl             (UNUSED)
+;  [sp,#4] = JCOEF *last_dc_val
 ;  [sp,#8] = int nblocks
 ; This routine is very similar to huff_decode_blocks, except that
 ; we do not actually output the block - we simply skip forward that far
@@ -415,7 +413,7 @@ asm_huff_skip_blocks
 huff_skip_anotherblock                           ; loop round to here nblocks times
 ; Set up huffman decoding for the DC component.
-        LDR     h_temp,[sp,#2*4]                 ; HUFF_TBL* dctbl    (DC table pointer)
+        LDR     h_temp,[sp,#2*4]                 ; JHUFF_TBL* dctbl    (DC table pointer)
         HUFF_DECODE_SETUP h_temp                 ; set maxcode,huffval,shortcut
 ; Handle the DC component
@@ -435,7 +433,7 @@ huff_skip_dc_0
         ; That's the DC value done.
 ; Set up huffman decoding for the AC components.
-        LDR     h_temp,[sp,#3*4]                 ; HUFF_TBL* actbl    (AC table pointer)
+        LDR     h_temp,[sp,#3*4]                 ; JHUFF_TBL* actbl    (AC table pointer)
         HUFF_DECODE_SETUP h_temp                 ; set maxcode,huffval,shortcut
 ; The loop that does AC components, once round for each non-zero component.
diff --git a/Sources/jrevdct b/Sources/jrevdct
index 6e70826c1d3db460b8f50749155dbdac627276af..d6abfda4fbd2a8252d43f14403e157ab98c99278 100644
--- a/Sources/jrevdct
+++ b/Sources/jrevdct
@@ -266,7 +266,7 @@ $rc._odd_shortcut
 ;; ------------------------------------------------------------------------
 ;; Test proc - procedure to do a 1-D DCT
 ;; ------------------------------------------------------------------------
-;; extern void dct_1d(decompress_info_ptr cinfo, int *data);
+;; extern void dct_1d(j_decompress_ptr cinfo, int *data);
 ;        STMDB   sp!,{r0-r12,lr}          ; save state
@@ -290,7 +290,7 @@ $rc._odd_shortcut
 ; r2=count
         EXPORT  asm_j_rev_dct
-asm_j_rev_dct                             ; extern void asm_j_rev_dct(decompress_info_ptr cinfo, DCTBLOCK data, int count);
+asm_j_rev_dct                             ; extern void asm_j_rev_dct(j_decompress_ptr cinfo, JBLOCK data, int count);
         CMP     r2,#0                     ; if count=0, do nothing
         MOVLE   pc,lr
diff --git a/VersionASM b/VersionASM
index cc446a944968d2d284c884d647c1094b5114fcc3..d3f0d9ebe2ebe8922ce1c5ede272f025d413a9aa 100644
--- a/VersionASM
+++ b/VersionASM
@@ -13,11 +13,11 @@
                         GBLS    Module_ComponentPath
 Module_MajorVersion     SETS    "1.38"
 Module_Version          SETA    138
-Module_MinorVersion     SETS    ""
-Module_Date             SETS    "23 Dec 2010"
-Module_ApplicationDate  SETS    "23-Dec-10"
+Module_MinorVersion     SETS    ""
+Module_Date             SETS    "04 Jan 2011"
+Module_ApplicationDate  SETS    "04-Jan-11"
 Module_ComponentName    SETS    "SprExtend"
 Module_ComponentPath    SETS    "mixed/RiscOS/Sources/Video/Render/SprExtend"
-Module_FullVersion      SETS    "1.38"
-Module_HelpVersion      SETS    "1.38 (23 Dec 2010)"
+Module_FullVersion      SETS    "1.38"
+Module_HelpVersion      SETS    "1.38 (04 Jan 2011)"
diff --git a/VersionNum b/VersionNum
index 0aec99713561dc3470f676814000652a7cf3d31d..d6c88986ee5a47205996a142d1f5dbac7b5de6b4 100644
--- a/VersionNum
+++ b/VersionNum
@@ -5,19 +5,19 @@
 #define Module_MajorVersion_CMHG        1.38
-#define Module_MinorVersion_CMHG        
-#define Module_Date_CMHG                23 Dec 2010
+#define Module_MinorVersion_CMHG
+#define Module_Date_CMHG                04 Jan 2011
 #define Module_MajorVersion             "1.38"
 #define Module_Version                  138
-#define Module_MinorVersion             ""
-#define Module_Date                     "23 Dec 2010"
+#define Module_MinorVersion             ""
+#define Module_Date                     "04 Jan 2011"
-#define Module_ApplicationDate          "23-Dec-10"
+#define Module_ApplicationDate          "04-Jan-11"
 #define Module_ComponentName            "SprExtend"
 #define Module_ComponentPath            "mixed/RiscOS/Sources/Video/Render/SprExtend"
-#define Module_FullVersion              "1.38"
-#define Module_HelpVersion              "1.38 (23 Dec 2010)"
+#define Module_FullVersion              "1.38"
+#define Module_HelpVersion              "1.38 (04 Jan 2011)"
 #define Module_LibraryVersionInfo       "1:38"
diff --git a/c/putscaled b/c/putscaled
new file mode 100644
index 0000000000000000000000000000000000000000..154a13226826ad366d0a3a2910ba7a3ce05c9938
--- /dev/null
+++ b/c/putscaled
@@ -0,0 +1,3486 @@
+/* Copyright 2011 Castle Technology Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/* c.PutScaled - the bitblit compiler for PutSpriteScaled/PlotMaskScaled */
+#include <stdarg.h>
+#include <string.h>
+#include <stdio.h>
+#include "swis.h"
+#include "commondefs.h"
+#include "jpeglib.h"
+#include "putscaled.h"
+#include "C:Global.h.Sprite"
+*                                                                         *
+*    Macros.                                                              *
+*                                                                         *
+#define SOURCE_32_BIT  (wp->save_inlog2bpp == 5) /* input is 32 bits per pixel (log2 bpp of 5); needs wp in scope */
+#define SOURCE_16_BIT  (wp->save_inlog2bpp == 4) /* input is 16 bits per pixel (log2 bpp of 4) */
+#define SOURCED_16_BIT (wp->save_inlog2bpc == 4) /* like SOURCE_16_BIT but includes 16-bit double-pixels */
+#define SOURCE_MASK    (ws->masked) /* whether to use the mask; needs ws in scope */
+#define SOURCE_BPPMASK (ws->mask1bpp) /* whether the mask is a 1bpp mask */
+#define SOURCE_TABLE   ((wp->ColourTTR != 0) || (wp->trns_palette != 0)) /* a translation table or a palette has been supplied */
+#define DPIXEL_INPUT   (wp->save_inlog2bpp != wp->save_inlog2bpc) /* input bits per pixel differs from bits per character, i.e. double-pixels */
+#define DPIXEL_OUTPUT  (wp->BPP != wp->BPC) /* output bits per pixel differs from bits per character, i.e. double-pixels */
+#define PLOTMASK       ((wp->spritecode & 255) == SpriteReason_PlotMaskScaled) /* low byte of the SpriteOp reason selects PlotMaskScaled */
+#define DEST_32_BIT    (wp->BPP == 32) /* output is 32 bits per pixel */
+#define DEST_16_BIT    (wp->BPP == 16) /* output is 16 bits per pixel */
+#define DEST_1_BIT     (wp->BPC == 1) /* output is 1 bit per character */
+#define DESTD_16_BIT   (wp->BPC == 16) /* like DEST_16_BIT but includes 16-bit double-pixels */
+*                                                                         *
+*    Low-level debugging output.                                          *
+*                                                                         *
+#ifdef DEBUG /* debugging build: tracing and checking macros call real routines */
+#define tracef(args)     do_sprintf(0, args) /* args is ONE macro argument; call sites join the format and its values with '_' (presumably a comma macro defined elsewhere - confirm) */
+#define assert(x, y)     do_assert(__LINE__, x, y, NULL) /* project-specific two-argument assert: y is the error code to raise if x is false */
+#define newline()        tracef("\n"); /* NOTE: the expansion already ends in ';' */
+#define comment(ws,text) do_comment(text) /* ws is accepted but not used by the expansion */
+#define IFDEBUG(a) a /* keep the argument only when debugging */
+#include "tracing.c" /* source file included textually */
+#define tracef(args)     /* Tracing disabled: expands to nothing, so args is never evaluated */
+#define assert(x, y)     {if (!(x)) exit_erl(y, __LINE__);} /* on failure calls exit_erl with the error code y and the source line */
+#define newline()        /* Tracing disabled: expands to nothing */
+#define comment(ws,text) /* Tracing disabled: expands to nothing */
+#define IFDEBUG(a)       /* Tracing disabled: the argument is discarded */
+*                                                                         *
+*    JPEG handling.                                                       *
+*                                                                         *
+#ifdef ASMjpeg
+#include "rojpeg.c"
+*                                                                         *
+*    C Workspace declarations.                                            *
+*                                                                         *
+/* Code buffers - each one holds generated code together with the parameters that were compiled into it */
+#define NBUFFERS 8       /* Number of code buffers */
+#define BUFSIZE 256      /* words per buffer */
+typedef struct
+  int key_word;              /* descriptor for this code, or -1 if empty */
+  int xadd;                  /* precise scale factors compiled into this code */
+  int xdiv;                  /* (scale factor, as above) */
+  int yadd;                  /* (scale factor, as above) */
+  int ydiv;                  /* (scale factor, as above) */
+  int outoffset;             /* output row offset compiled into this code */
+  int code[BUFSIZE];         /* the code itself, BUFSIZE words */
+} code_buffer;
+#define FOR_EACH_BUFFER(ptr) for (ptr = &ws->buffers[0]; ptr < &ws->buffers[NBUFFERS]; ptr++) /* step ptr (a code_buffer *) over all NBUFFERS entries of ws->buffers; needs ws in scope */
+/* Labels - there's one of these for each label in the code we generate. */
+typedef struct
+  int *def;          /* where the label is in the generated code, or 0 if not yet defined. */
+  int *ref;          /* the one pending forward-reference branch to the label, patched when the label is defined; 0 if none. */
+#ifdef DEBUG
+  char *name;        /* textual name of the label - same as field name (debugging builds only) */
+} label;
+/* Each label must be added as a field to this structure. FOR_EACH_LABEL walks the fields as if they were an array, so only 'label' members may appear between FIRST_LABEL and LAST_LABEL. */
+typedef struct
+  #define FIRST_LABEL loop_y_repeat /* first field - where FOR_EACH_LABEL starts */
+  label loop_y_repeat;
+  label test1;
+  label test2;
+  label loop_x_enter;
+  label loop_x_repeat;
+  label loop_x_exit;
+  label l_masked;
+  label loop_put_pixel_repeat;
+  label loop_put_masked_repeat;
+  label y_loop;
+  label y_loop_enter;
+  label y_loop_exit;
+  label loop_delay;
+  label x_evenstart;
+  label x_oddmask;
+  label x_aligned_loop;
+  label x_aligned_enter;
+  label x_alignmask1;
+  label x_alignmask2;
+  label x_misaligned;
+  label x_misaligned_loop;
+  label x_misaligned_enter;
+  label x_misalignmask1;
+  label x_misalignmask2;
+  label x_2atatime_exit;
+  label x_lastmask;
+  label loop_x_exit1;
+  label loop_x_exitskip;
+  label loop1;
+  label loop2;
+  label plot_loopa;
+  label plot_loop1;
+  label plot_loop1a;
+  label plot_loop1b;
+  label plot_loop1c;
+  label plot_loop2;
+  label plot_loop3;
+  label plot_loop4;
+  label plot_loop4a;
+  label plot_loop4b;
+  label plot_loop4c;
+  label last;
+  #define LAST_LABEL last /* last field - where FOR_EACH_LABEL stops (inclusive) */
+  /* If you add a label, also give it a name in check_workspace (the LN list); debugging builds assert that every label has one */
+} labels_rec;
+#define FOR_EACH_LABEL(ptr) for (ptr = &ws->labels.FIRST_LABEL; ptr <= &ws->labels.LAST_LABEL; ptr++) /* step ptr (a label *) over every field, first to last inclusive; needs ws in scope */
+#define L(name) (&(ws->labels.name)) /* address of the named label within the workspace */
+/* Register names - one of these for each symbolic register name (the physical register numbers are allocated at compile time) */
+typedef struct
+  int regno;     /* the physical register number; -1 is treated as "not yet assigned" by rr() */
+#ifdef DEBUG
+  char *name;    /* the name, for trace output (debugging builds only) */
+} regname;
+/* Each register name must be added as a field to this structure. FOR_EACH_REGISTER_NAME walks the fields as if they were an array, so only 'regname' members may appear between FIRST_REGISTER and LAST_REGISTER. */
+typedef struct
+  #define FIRST_REGISTER r_pixel /* first field - where FOR_EACH_REGISTER_NAME starts */
+  regname r_pixel;
+  regname r_inptr;
+  regname r_inshift;
+  regname r_inword;
+  regname r_maskinptr;
+  regname r_maskinword;
+  regname r_maskinshift;
+  regname r_masko;
+  regname r_temp1;
+  regname r_temp2;
+  regname r_c1632;
+  regname r_oditheradd;
+  regname r_blockroutine;
+  regname r_ecfindex;
+  regname r_bgcolour;
+  regname r_fetchroutine;
+  regname r_outptr;
+  regname r_outword;
+  regname r_outshift;
+  regname r_table;
+  regname r_xsize;
+  regname r_xcount;
+  regname r_ysize;
+  regname r_ycount;
+  regname r_inoffset;
+  regname r_maskinoffset;
+  regname r_in_pixmask;    /* only used by 2-at-a-time loop */
+  regname r1;
+  regname r2;
+  regname r3;
+  regname wp;
+  regname sp;
+  regname lr;
+  regname pc;
+  #define LAST_REGISTER pc /* last field - where FOR_EACH_REGISTER_NAME stops (inclusive) */
+} regnames_rec;
+#define FOR_EACH_REGISTER_NAME(ptr) for (ptr = &ws->regnames.FIRST_REGISTER; ptr <= &ws->regnames.LAST_REGISTER; ptr++) /* step ptr (a regname *) over every field, first to last inclusive; needs ws in scope */
+#if 0 /* disabled variant: takes the regname so that it can report the register's name */
+#define R(reg) rr(&ws->regnames.reg)
+static int rr(regname *r)
+  /* Reports the offending name before asserting, but makes code bigger, lots of string clashes - rats! */
+  if (r->regno == -1) tracef("Register %s not defined\n" _ r->name);
+  assert(r->regno != -1, ERROR_FATAL);
+  return r->regno;
+#define R(reg) rr(ws->regnames.reg.regno) /* R(name) gives the physical register number for a symbolic register name; needs ws in scope */
+static int rr(int r)
+  /* Assert that the register has been assigned a physical number (is not -1), then pass the number through */
+  assert(r != -1, ERROR_FATAL);
+  return r;
+/* The structure containing all workspace - essentially our static variables. */
+#define CHECK_CODE 123456789 /* sentinel value stored in check_code and check_code2 by check_workspace */
+typedef struct
+  /* Initialisation */
+  int  check_code;               /* equals CHECK_CODE once check_workspace has initialised the workspace */
+  /* Code buffer management */
+  int  build_buffer;             /* Buffer currently being built, or next to build */
+  int *compile_base;             /* base that ins() measures trace offsets from */
+  int *compile_ptr;              /* where to put next instruction */
+  int *compile_lim;              /* limit: ins() asserts compile_ptr < compile_lim before storing */
+  /* Label control and allocation */
+  labels_rec labels;             /* each label, and where it is in the generated code */
+  /* Register control and allocation */
+  regnames_rec regnames;         /* physical assignment of each register name */
+  int  next_free_reg;            /* allocator of physical registers, as they are needed. */
+  BOOL leave_r12_alone;          /* Leave assembler 'wp' in place during compiled code */
+  int  gcol;                     /* GCOL action */
+  BOOL masked;                   /* whether to use mask */
+  BOOL mask1bpp;                 /* whether mask is 1bpp mask */
+  int  odither;                  /* If 0, then there's no ordered dither. If non-0, number of bits - 1 being truncated by dither. */
+#if 0
+  int  odither_eorvalue;         /* value for eor alternation along a line */
+  int  odither_shift;            /* offset of two-bit dither value in r_oditheradd */
+  /* Assemble-time constants */
+  int  in_bpp;
+  int  in_bpc;                   /* Same as bpp unless double-pixel, in which case double bpp */
+  int  in_pixmask;
+  int  mask_bpp;
+  int  mask_bpc;
+  int  mask_pixmask;
+  int  out_l2bpp;                /* not provided in wp */
+  int  out_l2bpc;                /* ditto */
+  int  out_pixmask;              /* mask for one pixel */
+  int  out_dpixmask;
+  int  out_ppw;                  /* pixels per word */
+  int  out_l2ppw;
+  BOOL cal_table_simple;         /* If true, a simple table lookup is possible */
+  /* Space for compiled code, near the end so most field accesses have only a small offset. */
+  code_buffer buffers[NBUFFERS];
+  /* Check for workspace overwritten */
+  int  check_code2;              /* trailing sentinel, also set to CHECK_CODE; asserted on every check_workspace call */
+} workspace;
+static void check_workspace(workspace *ws)
+/* Basic validity checks on the workspace. On first use (check_code is not yet CHECK_CODE) initialise it: set both sentinels, reset build_buffer and mark every code buffer empty. */
+  assert(ws != 0, ERROR_NO_MEMORY);
+  if (ws->check_code != CHECK_CODE)
+  {
+    code_buffer *p;
+    tracef("Initialising workspace.\n");
+    ws->check_code = CHECK_CODE;
+    ws->check_code2 = CHECK_CODE;
+    ws->build_buffer = 0;
+    FOR_EACH_BUFFER(p) p->key_word = -1; /* -1 marks a buffer as empty */
+#ifdef DEBUG
+    {
+      label *l;
+      /* Set up textual names of all the labels: clear them all, name each one, then verify none was missed */
+      FOR_EACH_LABEL(l) l->name = 0;
+      #define LN(lname) ws->labels.lname.name = #lname;
+      LN(loop_y_repeat)
+      LN(test1)
+      LN(test2)
+      LN(loop_x_enter)
+      LN(loop_x_repeat)
+      LN(loop_x_exit)
+      LN(l_masked)
+      LN(loop_put_pixel_repeat)
+      LN(loop_put_masked_repeat)
+      LN(y_loop)
+      LN(y_loop_enter)
+      LN(y_loop_exit)
+      LN(loop_delay)
+      LN(x_evenstart)
+      LN(x_oddmask)
+      LN(x_aligned_loop)
+      LN(x_aligned_enter)
+      LN(x_alignmask1)
+      LN(x_alignmask2)
+      LN(x_misaligned)
+      LN(x_misaligned_loop)
+      LN(x_misaligned_enter)
+      LN(x_misalignmask1)
+      LN(x_misalignmask2)
+      LN(x_2atatime_exit)
+      LN(x_lastmask)
+      LN(loop_x_exit1)
+      LN(loop_x_exitskip)
+      LN(loop1)
+      LN(loop2)
+      LN(plot_loopa)
+      LN(plot_loop1)
+      LN(plot_loop1a)
+      LN(plot_loop1b)
+      LN(plot_loop1c)
+      LN(plot_loop2)
+      LN(plot_loop3)
+      LN(plot_loop4)
+      LN(plot_loop4a)
+      LN(plot_loop4b)
+      LN(plot_loop4c)
+      LN(last)
+      /* Check that every label in labels_rec has been given a name */
+      FOR_EACH_LABEL(l) assert(l->name != 0, ERROR_FATAL);
+    }
+    {
+      regname *r;
+      FOR_EACH_REGISTER_NAME(r) r->name = 0; /* same scheme as for the labels: clear, name each, verify */
+      #define RNN(rname) ws->regnames.rname.name = #rname;
+      RNN(r_pixel)
+      RNN(r_inptr)
+      RNN(r_inshift)
+      RNN(r_inword)
+      RNN(r_maskinptr)
+      RNN(r_maskinword)
+      RNN(r_maskinshift)
+      RNN(r_masko)
+      RNN(r_temp1)
+      RNN(r_temp2)
+      RNN(r_c1632)
+      RNN(r_oditheradd)
+      RNN(r_blockroutine)
+      RNN(r_ecfindex)
+      RNN(r_bgcolour)
+      RNN(r_fetchroutine)
+      RNN(r_outptr)
+      RNN(r_outword)
+      RNN(r_outshift)
+      RNN(r_table)
+      RNN(r_xsize)
+      RNN(r_xcount)
+      RNN(r_ysize)
+      RNN(r_ycount)
+      RNN(r_inoffset)
+      RNN(r_maskinoffset)
+      RNN(r_in_pixmask)
+      RNN(r1)
+      RNN(r2)
+      RNN(r3)
+      RNN(wp)
+      RNN(sp)
+      RNN(lr)
+      RNN(pc)
+      FOR_EACH_REGISTER_NAME(r) assert(r->name != 0, ERROR_FATAL);
+      ws->leave_r12_alone = FALSE;
+    }
+  }
+  assert(ws->check_code2 == CHECK_CODE, ERROR_FATAL); /* trailing sentinel must be intact, i.e. the workspace has not been overwritten */
+#ifdef DEBUG
+static void dump_asm_workspace(asm_workspace *wp)
+  /* Trace the fields of the assembler workspace, one per line with a description. Oddly spaced out to allow it to be easily lined up with the structure definition */
+  tracef("Assembler workspace at %x:\n" _ wp);
+  tracef("save_outoffset=%i        %t32. byte offset between output rows - SUBTRACT for next row.\n" _ wp->save_outoffset);
+  tracef("save_inoffset=%i         %t32. byte offset between input rows - SUBTRACT for next row.\n" _ wp->save_inoffset);
+  tracef("save_inptr=0x%x          %t32. word address of input pixels.\n" _ wp->save_inptr);
+  tracef("save_outptr=0x%x         %t32. address of word containing first output pixel.\n" _ wp->save_outptr);
+  tracef("save_ydiv=%i             %t32. subtracter value for y scale.\n" _ wp->save_ydiv);
+  tracef("save_yadd=%i             %t32. adder value for y scale.\n" _ wp->save_yadd);
+  tracef("save_ysize=%i            %t32. number of output rows.\n" _ wp->save_ysize);
+  tracef("save_ycount=%i           %t32. total of ymag/ydiv sum, for y scale factor\n" _ wp->save_ycount);
+  newline();
+  tracef("save_inshift=%i          %t32. bit shift of first pixel.\n" _ wp->save_inshift);
+  tracef("save_xsize=%i            %t32. number of output pixels per row.\n" _ wp->save_xsize);
+  tracef("save_xcount=%i           %t32. total of xmag/xdiv sum, for x scale factor\n" _ wp->save_xcount);
+  tracef("save_ecfptr=0x%x         %t32. ECF pointer - only useful if plotting the mask.\n" _ wp->save_ecfptr);
+  tracef("save_ecflimit=0x%x       %t32. ECF limit - only useful if plotting the mask.\n" _ wp->save_ecflimit);
+  tracef("save_xdiv=%i             %t32. subtracter value for x scale.\n" _ wp->save_xdiv);
+  tracef("save_xadd=%i             %t32. adder value for x scale\n" _ wp->save_xadd);
+  newline();
+  tracef("save_masko=%i            %t32. if not 1bpp mask then this is mask data offset from inptr. Otherwise...\n" _ wp->save_masko);
+  tracef("save_xcoord=%i           %t32. pixel x coordinate of first output pixel.\n" _ wp->save_xcoord);
+  tracef("save_ycoord=%i           %t32. pixel y coordinate of first output pixel.\n" _ wp->save_ycoord);
+  tracef("save_xmag=%i             %t32. adder value for x scale?\n" _ wp->save_xmag);
+  tracef("save_ymag=%i             %t32. adder value for y scale?\n" _ wp->save_ymag);
+  newline();
+  tracef("save_inlog2bpp=%i        %t32. log 2 bits per pixel of input.\n" _ wp->save_inlog2bpp);
+  tracef("save_inlog2bpc=%i        %t32. log 2 bits per character of input (only different for double-pixels).\n"
+                                   _ wp->save_inlog2bpc);
+  tracef("save_mode=%i (>>27 = %i) %t32. mode number/pointer of sprite - 1bpp sprites have hi bits set.\n" _ wp->save_mode _ wp->save_mode >> 27);
+  newline();
+  tracef("save_maskinshift=%i      %t32. initial bit shift within mask word.\n" _ wp->save_maskinshift);
+  tracef("save_maskinptr=0x%x      %t32. word address of mask (or 0 if there isn't one).\n" _ wp->save_maskinptr);
+  tracef("save_maskinoffset=%i     %t32. byte offset between mask rows - SUBTRACT for next row.\n" _ wp->save_maskinoffset);
+  newline();
+  tracef("BPP=%i                   %t32. bits per pixel of output.\n" _ wp->BPP);
+  tracef("BPC=%i                   %t32. bits per character of output (only different for double pixels).\n" _ wp->BPC);
+  tracef("ColourTTR=0x%x           %t32. translation table or palette.\n" _ wp->ColourTTR);
+  tracef("trns_palette=0x%x        %t32. if non-0 ignore TTR and use this palette instead.\n" _ wp->trns_palette);
+  tracef("spritecode=%i (& 255 = %i) %t32. SpriteOp - 52 for PutSpriteScaled, 50 for PlotMaskScaled.\n" _ wp->spritecode _ wp->spritecode & 255);
+  tracef("bgcolour=%i              %t32. Background colour (only valid if plotting the mask)\n" _ wp->bgcolour);
+  newline();
+static void dump_workspace(workspace *ws) /* Trace the C workspace: build_buffer and the key_word of every code buffer */
+  code_buffer *p;
+  tracef("Dumping workspace.\n");
+  #define DUMPINT(field) tracef("%s = %i.\n" _ #field _ ws->field); /* trace one integer field of ws by name */
+  DUMPINT(build_buffer)
+  FOR_EACH_BUFFER(p) tracef("buffer->keyword = %i.\n" _ p->key_word); /* one line per buffer; -1 means empty */
+*                                                                         *
+*    Low-level instruction generation.                                    *
+*                                                                         *
+/* Condition codes, as the top four bits of an instruction word. A value of 0 here means 'always'; ins() converts to the real ARM encoding when the instruction is stored. */
+#define EQ 0xf0000000      /* It's 0 really - frigged so that 0 can be 'always' - the usual case; ins() maps 0xf back to 0. */
+#define NE 0x10000000
+#define CS 0x20000000
+#define CC 0x30000000
+#define MI 0x40000000
+#define PL 0x50000000
+#define VS 0x60000000
+#define VC 0x70000000
+#define HI 0x80000000
+#define LS 0x90000000
+#define GE 0xa0000000
+#define LT 0xb0000000
+#define GT 0xc0000000
+#define LE 0xd0000000
+#define AL 0xe0000000
+#define NV 0xDONOTUSE /* not a valid constant, so any use of NV fails to compile (presumably deliberate) */
+/* Branches */
+#define B  0x0a000000
+#define BL 0x0b000000
+#define B_OFFSET_MASK 0x00ffffff /* and with this for negative offsets, to keep them within the 24-bit offset field */
+/* ALU ops - each macro emits one data-processing instruction via ins(); 'rest' carries operand 2 plus any condition code or S bit, 'str' is the listing text. Needs ws in scope. */
+#define S  (1<<20) /* OR into 'rest' to make the instruction set the condition flags */
+#define AND(dst,op1,rest,str)      ins(ws,(0x0 << 21) | DSTR(dst) | OP1R(op1) | (rest), str)
+#define EOR(dst,op1,rest,str)      ins(ws,(0x1 << 21) | DSTR(dst) | OP1R(op1) | (rest), str)
+#define SUB(dst,op1,rest,str)      ins(ws,(0x2 << 21) | DSTR(dst) | OP1R(op1) | (rest), str)
+#define RSB(dst,op1,rest,str)      ins(ws,(0x3 << 21) | DSTR(dst) | OP1R(op1) | (rest), str)
+#define ADD(dst,op1,rest,str)      ins(ws,(0x4 << 21) | DSTR(dst) | OP1R(op1) | (rest), str)
+#define ADC(dst,op1,rest,str)      ins(ws,(0x5 << 21) | DSTR(dst) | OP1R(op1) | (rest), str)
+#define SBC(dst,op1,rest,str)      ins(ws,(0x6 << 21) | DSTR(dst) | OP1R(op1) | (rest), str)
+#define RSC(dst,op1,rest,str)      ins(ws,(0x7 << 21) | DSTR(dst) | OP1R(op1) | (rest), str)
+#define TST(op1,rest,str)          ins(ws,(0x8 << 21) | S | OP1R(op1) | (rest), str)
+#define TEQ(op1,rest,str)          ins(ws,(0x9 << 21) | S | OP1R(op1) | (rest), str)
+#define CMP(op1,rest,str)          ins(ws,(0xa << 21) | S | OP1R(op1) | (rest), str)
+#define CMN(op1,rest,str)          ins(ws,(0xb << 21) | S | OP1R(op1) | (rest), str)
+#define ORR(dst,op1,rest,str)      ins(ws,(0xc << 21) | DSTR(dst) | OP1R(op1) | (rest), str)
+#define MOV(dst,rest,str)          ins(ws,(0xd << 21) | DSTR(dst) | (rest), str)
+#define BIC(dst,op1,rest,str)      ins(ws,(0xe << 21) | DSTR(dst) | OP1R(op1) | (rest), str)
+#define MVN(dst,rest,str)          ins(ws,(0xf << 21) | DSTR(dst) | (rest), str)
+#define ADD_OPCODE (0x4 << 21) /* bare opcode fields, for building an instruction word by hand and passing it to ins() */
+#define SUB_OPCODE (0x2 << 21)
+#define MOV_OPCODE (0xd << 21)
+#define DSTR(x) ((x) << 12)          /* destination - ignored by TST/TEQ/CMP/CMN */
+#define OP1R(x) ((x) << 16)          /* first operand */
+#define OP2R(x) ((x) << 0)           /* second operand register, if !IMM */
+#define IMM(x) ((x) | (1<<25))       /* an 8-bit unsigned field */
+#define IMMROR(x) ((x) << 7)         /* an EVEN number to rotate right IMM by */
+#define LSLI(x) (((x) << 7) | 0x00)   /* 5-bit immed shift applied to OP2R */
+#define LSRI(x) (((x) << 7) | 0x20)
+#define ASRI(x) (((x) << 7) | 0x40)
+#define RORI(x) (((x) << 7) | 0x60)
+#define LSLR(x) (((x) << 8) | 0x10)   /* shift register applied to OP2R */
+#define LSRR(x) (((x) << 8) | 0x30)
+#define ASRR(x) (((x) << 8) | 0x50)
+#define RORR(x) (((x) << 8) | 0x70)
+/* Load and store ops - these give the base opcode with the transfer and base registers; OR in one of the addressing forms below */
+#define LDR(reg,basereg)  (0x04100000 | ((reg) << 12)| ((basereg) << 16))
+#define STR(reg,basereg)  (0x04000000 | ((reg) << 12)| ((basereg) << 16))
+#define LDRB(reg,basereg) (0x04500000 | ((reg) << 12)| ((basereg) << 16))
+#define STRB(reg,basereg) (0x04400000 | ((reg) << 12)| ((basereg) << 16))
+#define WRITEBACK (1 << 21) /* write the modified address back to the base register */
+#define ADDOFFSET (1 << 23) /* else subtract */
+#define PREADD (1 << 24) /* else post */
+#define OFFSET(x) (PREADD | ADDOFFSET | (x))        /* normal simple index */
+#define NEGOFFSET(x) (PREADD | (x))                 /* subtract offset */
+#define PREDEC(x) (WRITEBACK | PREADD | (x))        /* subtract offset before the transfer, with writeback */
+#define POSTINC(x) (ADDOFFSET | (x))                /* The manual says, do not set WRITEBACK if doing post-addition */
+#define POSTDEC(x) ((x))                            /* writeback will always occur; setting WRITEBACK as well gives LDRT/LDRBT instead */
+#define PUSH (0x08000000 | (13<<16) /* register 13 */ \
+                         | (1<<21) /* write-back */ \
+                         | (1<<24) /* pre-indexed; with the add bit clear this is STMDB sp! - OR in the register mask */)
+#define POP  (0x08000000 | (13<<16) /* register 13 */ \
+                         | (1<<20) /* load from memory */ \
+                         | (1<<21) /* write-back */ \
+                         | (1<<23) /* add, not subtract */ )
+#define LDMIA(reg) (0x08000000 | ((reg)<<16) /* base register to load from; parenthesised so an expression argument is shifted as a whole */ \
+                               | (1<<20) /* load from memory */ \
+                               | (1<<23) /* add, not subtract */ )
+#define STMIA(reg) (0x08000000 | ((reg)<<16) /* base register to store to; parenthesised as for LDMIA */ \
+                               | (1<<23) /* add, not subtract */ )
+/* Supervisor call - swino is parenthesised so an expression argument is ORed in as a whole */
+#define SWI(swino) (0x0F000000 | (swino))
+/* Indexed load: address is base + (reg LSL shift) - LSL shift assumed - writeback or negative not covered */
+#define INDEX(reg, shift) ((1<<25) | OFFSET(0) | OP2R(reg) | LSLI(shift))
+/* Offset of a field in the assembler workspace: byte distance from WP_FIRST_FIELD, as a positive pre-indexed offset. Needs wp in scope. */
+#define WP_OFFSET(field) OFFSET(((char*)&(wp->field)) - ((char*)&(wp->WP_FIRST_FIELD)))
+/* Define an assembler register: bind a register name to physical register 'no' (-1 allocates the next free one). Needs ws in scope; NOTE the expansion already ends in ';'. */
+#define RN(name,no,describe) set_regname(ws, &ws->regnames.name, no, describe);
+#ifdef DEBUG
+static void ldm_reg_list(workspace *ws, char *a, int regmask, BOOL lastname)
+/* Construct a string in a which can be placed in curly brackets, describing
+ * a LDM/STM instruction: the names of the registers set in regmask (bit i = physical register i),
+ * comma-separated, in ascending register order. If lastname then find the last such register name in
+ * the case of duplicates - eg. the y-loop name rather than the x-loop name
+ * for the same physical register. a is not bounds-checked: the caller must supply enough room. Asserts if a register has no name.
+ */
+  int i;
+  regname *r;
+  BOOL found;
+  char *aptr;
+  a[0] = 0;
+  for (i = 0; i <= 15; i++) /* for each physical register */
+  {
+    if ((regmask & (1<<i)) != 0) /* find a name for this register */
+    {
+      found = FALSE;
+      aptr = a;
+      while (*aptr != 0) aptr++; /* points at the null at the end of the string */
+      FOR_EACH_REGISTER_NAME(r) { /* try every register name; without this loop r is read uninitialised and the break below leaves the physical-register loop */
+        if (r->regno == i)
+        {
+          *aptr = 0; /* If lastname and finding it again, delete last one */
+          if (a[0] != 0) strcat(aptr, ",");
+          strcat(aptr, r->name);
+          found = TRUE;
+          if (!lastname) break; /* first matching name is enough */
+        }
+      }
+      assert(found, ERROR_FATAL);
+    }
+  }
+#ifdef DEBUG
+static void ins(workspace *ws, int w, char *description)
+#define ins(ws,w,description) do_ins(ws,w)
+static void do_ins(workspace *ws, int w)
+/* Put an instruction word w into the output buffer at compile_ptr, fixing up its condition code first.
+ * When debugging, an assembler listing is generated too. This can be fed through
+ * objasm, and the results compared with the opcodes that I generate.
+ * Columns of assembler output:
+ * addressX  opcodeXX  label   opcodes regs                            comment
+ * ^0        ^10       ^20     ^28     ^36                             ^68
+ */
+  int ccode = w & 0xf0000000;
+  /* The macros use 0 for AL and 0xf for EQ, so that 0 can be AL elsewhere; swap them back to the real encoding here. */
+  if (ccode == 0xf0000000) w = w & 0x0fffffff;   /* EQ code */
+  else if (ccode == 0) w = w | 0xe0000000;       /* AL code */
+  /* All other condition codes are already as the ARM expects */
+  tracef("%x  %x  %t28.%s\n" _
+    (ws->compile_ptr - ws->compile_base) * sizeof(int) _
+    w _ description); /* pseudo-assembler format of output: byte offset, opcode, listing text */
+  assert(ws->compile_ptr < ws->compile_lim, ERROR_NO_MEMORY); /* Check the buffer is big enough */
+  *(ws->compile_ptr)++ = w; /* Store the word, then advance compile_ptr */
+#ifdef DEBUG
+#define DEFINE_LABEL(lab,describe) define_label(ws, L(lab), describe);
+static void define_label(workspace *ws, label *lab, char *description)
+#define DEFINE_LABEL(lab,describe) define_label(ws, L(lab));
+static void define_label(workspace *ws, label *lab)
+/* Define a label at the current compile position, and fill in a forward reference to it if necessary. */
+   assert(lab->def == 0, ERROR_FATAL); /* Check not defined twice */
+   lab->def = ws->compile_ptr;
+   tracef("%t20.%s%t68.; %s\n" _ lab->name _ description);
+   if (lab->ref != 0) /* a branch to this label was compiled earlier with a zero offset */
+   {
+     int newvalue = *(lab->ref) | (B_OFFSET_MASK & (lab->def - (lab->ref + 2))); /* compute offset in words, measured from two words past the branch */
+     tracef("%t20.; Zapping forward ref instruction at %x to be %x.\n" _
+       sizeof(int) * (lab->ref - ws->compile_base) _ newvalue);
+     *(lab->ref) = newvalue;
+     lab->ref = 0; /* reference resolved */
+   }
+#ifdef DEBUG
+static void branch(workspace *ws, unsigned int opcode, label *lab, char *description)
+#define branch(ws,opcode,lab,description) do_branch(ws,opcode,lab)
+static void do_branch(workspace *ws, unsigned int opcode, label *lab)
+/* Compile a branch instruction to a label. The opcode includes the condition code. Only one unresolved forward reference per label is supported. */
+  if (lab->def == 0) /* Forward reference */
+  {
+#ifdef DEBUG
+    if (lab->ref != 0)
+      tracef("Already referenced at 0x%x\n" _ sizeof(int) * (lab->ref - ws->compile_base));
+    assert(lab->ref == 0, ERROR_FATAL); /* Check for two forward refs to same label */
+    lab->ref = ws->compile_ptr; /* remember where the branch is so define_label can patch it */
+    ins(ws, opcode, description); /* Just give as offset 0 for now */
+  }
+  else
+  {
+    assert(lab->ref == 0, ERROR_FATAL); /* a defined label should have no pending reference */
+    ins(ws,
+      opcode | (B_OFFSET_MASK & (lab->def - (ws->compile_ptr + 2))), description); /* backward branch: word offset measured from two words past this instruction */
+  }
+#ifdef DEBUG
+static void set_regname(workspace *ws, regname *r, int regno, char *describe)
+#define set_regname(ws,r,regno,describe) do_set_regname(ws,r,regno)
+static void do_set_regname(workspace *ws, regname *r, int regno)
+/* Allocate a physical register number. If regno is -1 then allocate an
+ * as-yet-unused one, otherwise it's a specific register number.
+ *   ws       - compiler workspace; ws->next_free_reg is the allocation counter.
+ *   r        - register name record whose regno field is set.
+ *   regno    - -1 to auto-allocate, or the explicit register number to use.
+ *   describe - comment text for the trace listing.
+ */
+  if (regno == -1) /* allocate a number, one of 0..12 */
+  {
+    regno = ws->next_free_reg;
+    ws->next_free_reg++;
+    assert(regno >= 0 && regno <= 12, ERROR_FATAL); /* Check for register overflow */
+    /* r12 may only be handed out when the compiled code does not need it preserved. */
+    if (regno == 12) assert(!ws->leave_r12_alone, ERROR_FATAL);
+  }
+  r->regno = regno;
+  tracef("%t20.%s%t27 RN %t36.%i %t68.; %s\n" _ r->name _ r->regno _ describe);
+static void align16(asm_workspace *wp, workspace *ws)
+/* Align next instruction to quadword (16-byte) boundary, by emitting
+ * "MOV r_pixel,r_pixel" no-ops until ws->compile_ptr is 16-byte aligned.
+ * wp is unused.
+ */
+  UNUSED(wp);
+  /* The mask must be parenthesised: '!=' binds tighter than '&', so the
+   * unparenthesised form tested (ptr & 1), which is always zero for an
+   * int-aligned pointer and therefore never emitted any padding.
+   */
+  while ((((int) ws->compile_ptr) & 15) != 0)
+    MOV(R(r_pixel), OP2R(R(r_pixel)),                        "MOV     r_pixel,r_pixel                 ; align to 16-byte boundary");
+#if defined(DEBUG_TML) && defined(DEBUG)
+static void write_reg(workspace *ws, regname *reg)
+/* Spool the register to the TML hardware.
+ * Compiles code that saves r0,r1,r10,r11,r14, copies the given register to r1,
+ * then passes each of its four bytes (least significant first) in r0 to
+ * SWI HostFS_WriteC, and finally restores the saved registers.
+ *   ws  - compiler workspace.
+ *   reg - the register whose value the generated code will output.
+ * The listing strings use "Rd,Rm,LSR #n" (with the comma) like the rest of the
+ * file, so that the trace output remains valid assembler source.
+ */
+    comment(ws, "Write Register to TML card");
+    tracef("Register to be output is... %s\n" _ reg->name);
+    ins(ws, PUSH | (1<<10) | (1<<11) | 1 | (1<<1) | (1<<14),        "STMDB   sp!,{r0,r1,r10,r11,r14}          ; prepare to call SWI");
+    ins(ws, MOV_OPCODE | DSTR(1) | OP2R(reg->regno),                "MOV     r1,r_somereg");
+    ins(ws, MOV_OPCODE | DSTR(0) | OP2R(1),                         "MOV     r0,r1");
+    AND(0, 0, IMM(0xff),                                            "AND     r0,r0,#255 ");
+    ins(ws, SWI(HostFS_WriteC),                                     "SWI     HostFS_WriteC                    ; convert r1 value");
+    ins(ws, MOV_OPCODE | DSTR(0) | OP2R(1) | LSRI(8),               "MOV     r0,r1,LSR #8");
+    AND(0, 0, IMM(0xff),                                            "AND     r0,r0,#255                               ");
+    ins(ws, SWI(HostFS_WriteC),                                     "SWI     HostFS_WriteC                    ; convert r1 value");
+    ins(ws, MOV_OPCODE | DSTR(0) | OP2R(1) | LSRI(16),              "MOV     r0,r1,LSR #16");
+    AND(0, 0, IMM(0xff),                                            "AND     r0,r0,#255                               ");
+    ins(ws, SWI(HostFS_WriteC),                                     "SWI     HostFS_WriteC                    ; convert r1 value");
+    ins(ws, MOV_OPCODE | DSTR(0) | OP2R(1) | LSRI(24),              "MOV     r0,r1,LSR #24");
+    AND(0, 0, IMM(0xff),                                            "AND     r0,r0,#255                               ");
+    ins(ws, SWI(HostFS_WriteC),                                     "SWI     HostFS_WriteC                    ; convert r1 value");
+    ins(ws, POP | (1<<10) | (1<<11) | 1 | (1<<1) | (1<<14),         "LDMIA   sp!,{r0,r1,r10,r11,r14}          ; restore after calling SWI");
+    comment(ws, "");
+static void compile_buffer_init(asm_workspace *wp, workspace *ws)
+/* We intend to compile some code. Pick a buffer to use, and set up
+ * for generating into it. We use a simple round-robin for reusing buffers,
+ * rather than attempting to do LRU.
+ *   wp - assembler workspace; read here only for the trace output.
+ *   ws - compiler workspace; compile_base/compile_ptr/compile_lim, all labels,
+ *        all register names and next_free_reg are reset.
+ * The buffer used is ws->buffers[ws->build_buffer].
+ */
+  label *p;
+  regname *r;
+  code_buffer *b = &(ws->buffers[ws->build_buffer]);
+  ws->compile_base = &(b->code[0]);
+  ws->compile_ptr = ws->compile_base;
+  ws->compile_lim = ws->compile_base + BUFSIZE;
+  FOR_EACH_LABEL(p) {p->def = 0; p->ref = 0;} /* zap all the labels to be undefined. */
+  FOR_EACH_REGISTER_NAME(r) r->regno = -1; /* -1 marks a register name as not allocated */
+  ws->next_free_reg = 0; /* allocate registers from 0 */
+  tracef("Compile buffer initialised.\n");
+  /* Header of the pseudo-assembler listing: describe what is being compiled. */
+  tracef("%t20; Blitting code for %s, scale factors %i:%i,%i:%i outoffset %x\n" _
+    (PLOTMASK ? "PlotMaskScaled" : "PutSpriteScaled") _
+    b->xadd - b->xdiv _ b->xdiv _ b->yadd _ b->ydiv _ wp->save_outoffset);
+  tracef("%t20; gcol action=%i in-bpp=%i out-bpp=%i in-dpix=%s out-dpix=%s mask=%s 1bppmask=%s palette=%s table=%s\n" _
+    ws->gcol _ (1<<wp->save_inlog2bpp) _ wp->BPP _
+    whether(DPIXEL_INPUT) _ whether(DPIXEL_OUTPUT) _
+    whether(SOURCE_MASK) _ whether(SOURCE_BPPMASK) _
+    whether(wp->trns_palette != 0) _ whether(wp->ColourTTR != 0));
+  tracef("%t20.; Generated by compiler of (%s %s)\n" _ __DATE__ _ __TIME__);
+  comment(ws, "Get register and workspace definitions, turn on listing");
+  tracef("%t28.GET     w.GenHdr\n");
+  tracef("%t28.OPT     1\n");
+  /* Register names that are always bound to fixed physical registers. */
+  RN(r1, 1, "r1");
+  RN(r2, 2, "r2");
+  RN(r3, 3, "r3");
+  RN(wp, 12, "workspace pointer")
+  RN(sp, 13, "stack pointer")
+  RN(lr, 14, "link register")
+  RN(pc, 15, "program counter")
+  ws->leave_r12_alone = FALSE;  /* by default, compiled code does not have module workspace pointer */
+  UNUSED(wp);
+static void compile_buffer_done(workspace *ws)
+/* Finished compiling code sequence.
+ * Advances ws->build_buffer to the next buffer (wrapping at NBUFFERS) and, in
+ * DEBUG builds, checks for labels that still have an unresolved forward reference.
+ */
+#ifdef DEBUG
+  label *p;
+  tracef("%t28.END\n");
+  tracef("Compile buffer done, %i words generated.\n" _ ws->compile_ptr - ws->compile_base);
+  /* Increment pointer for next buffer to reuse. */
+  ws->build_buffer++;
+  if (ws->build_buffer >= NBUFFERS) ws->build_buffer = 0;
+#ifdef DEBUG
+  /* Check no unresolved references to labels */
+  /* NOTE(review): p is not assigned anywhere in the lines visible here; a
+   * FOR_EACH_LABEL(p) loop header is presumably expected before this block - confirm.
+   */
+  {
+    IFDEBUG(if(p->ref != 0) tracef("Unresolved reference to label %s at %x\n" _ p->name _ sizeof(int) * (p->ref - ws->compile_base));)
+    assert(p->ref == 0, ERROR_FATAL);
+  }
+  /* ws->compile_base can be used as the base of the resulting procedure. */
+*                                                                         *
+*    Test main entry sequence, low level IO and code generation.          *
+*                                                                         *
+blitter putscaled_compiler(asm_workspace *wp, workspace *ws, workspace *ws_end, int gcol)
+/* Self-test entry point: exercises the low-level output routines, the workspace
+ * checks, and the code generator by compiling one instance of each instruction
+ * form, so that the trace listing can be compared against a real assembler.
+ *   wp     - assembler workspace.
+ *   ws     - C compiler workspace, expected to follow wp in memory.
+ *   ws_end - end of the C workspace; must be above ws.
+ *   gcol   - not used in the lines visible here.
+ * NOTE(review): loop_y_repeat is branched to three times and loop_y_exit once
+ * without either being defined in this routine; branch() asserts on a second
+ * forward reference to a label - confirm this test is still expected to run clean.
+ */
+  /* Test low-level output */
+  writes(0, "Hello there!\nhi!\n");
+  writech(0, 'X');
+  writehex(0, 0x5732abcd, 8);
+  writech(0, '_');
+  /* Arguments are separated with '_' like every other tracef call in this file. */
+  tracef("Test tracef, esc:%%, string:'%s', char:'%c', int:'%i', hex:'%x'.\n" _
+    "hello" _ 'X' _ 1234567 _ 0x6789abcd);
+  /* Check that the assembler has an adequate opinion of our workspace needs. */
+  tracef("wp=%x ws=%x ws_end=%x.\n" _ wp _ ws _ ws_end);
+  tracef("Size of assembler workspace: %i.\n" _ ((char*)ws) - ((char*)wp));
+  tracef("Size of C workspace: %i.\n" _ ((char*)ws_end) - ((char*)ws));
+  assert(ws_end > ws, ERROR_NO_MEMORY);
+  check_workspace(ws);
+  dump_workspace(ws);
+  compile_buffer_init(wp, ws);
+  /* Check compilation of all instruction forms. The resulting trace output can then
+   * have its binary details stripped, be run through objasm, and the resulting listings
+   * compared to check the bit patterns generated.
+   */
+  ADD(5, 6, GT | OP2R(7),                             "ADDGT   r5,r6,r7");
+  branch(ws, B, L(loop_y_exit),                       "B       loop_y_exit");
+  ORR(2, 3, S | IMM(23),                              "ORRS    r2,r3,#23");
+  DEFINE_LABEL(test1, "test label")
+  EOR(1, 2, EQ | IMM(255) | IMMROR(10),               "EOREQ   r1,r2,#(255:ROR:10)");
+  comment(ws, "This is a comment");
+  DEFINE_LABEL(test2, "test label")
+  branch(ws, B + NE, L(loop_y_repeat),                "BNE     loop_y_repeat");
+  branch(ws, BL + EQ, L(loop_y_repeat),               "BLEQ    loop_y_repeat");
+  CMP(8, OP2R(9) | LSLI(12),                          "CMP     r8,r9,LSL #12");
+  CMP(8, OP2R(9) | ASRR(6),                           "CMP     r8,r9,ASR r6");
+  MOV(3, OP2R(4) | RORI(1),                           "MOV     r3,r4,ROR #1");
+  MOV(R(pc), OP2R(R(lr)),                             "MOV     pc,lr");
+  ins(ws, LDR(8,3) | OFFSET(249),                     "LDR     r8,[r3,#249]");
+  ins(ws, LDR(8,3) | OFFSET(0),                       "LDR     r8,[r3]");
+  ins(ws, STR(1,2) | GT + NEGOFFSET(12),              "STRGT   r1,[r2,#-12]");
+  ins(ws, LDRB(1,2) | PREINC(4),                      "LDRB    r1,[r2,#4]!");
+  ins(ws, STRB(6,7) | POSTINC(4),                     "STRB    r6,[r7],#4");
+  ins(ws, LDRB(1,2) | PREDEC(4),                      "LDRB    r1,[r2,#-4]!");
+  ins(ws, STRB(6,7) | POSTDEC(4),                     "STRB    r6,[r7],#-4");
+  ins(ws, LDRB(8,3) | INDEX(4,0),                     "LDRB    r8,[r3,r4]");
+  ins(ws, LDR(8,3) | INDEX(4,2),                      "LDR     r8,[r3,r4,LSL #2]");
+  ins(ws, PUSH | GT | (1<<4) | (1<<5) | (1<<6),       "STMGTDB sp!,{r4,r5,r6}");
+  ins(ws, POP | (1<<4) | (1<<5) | (1<<6),             "LDMIA   sp!,{r4,r5,r6}");
+  ins(ws, POP | (1<<4) | (1<<5) | (1<<6) | (1<<R(pc)),"LDMIA   sp!,{r4,r5,r6,pc}");
+  {
+    char a[256];
+    char a2[256];
+    int regmask = (1<<13) | (1<<15); /* pretty arbitrary silly one actually */
+    ldm_reg_list(ws, a, regmask, FALSE);
+    do_sprintf(a2, "LDMIA   lr,{%s}", a);
+    ins(ws, LDMIA(lr) | regmask, a2);
+    /* The listing must name the same base register (lr) as the opcode built below. */
+    do_sprintf(a2, "STMIA   lr,{%s}", a);
+    ins(ws, STMIA(lr) | regmask, a2);
+  }
+  branch(ws, BL + EQ, &ws->labels.loop_y_repeat,                        "BLEQ    loop_y_repeat");
+  compile_buffer_done(ws);
+  writes(0, "Exit.\n");
+*                                                                         *
+*    Bitblit: Evaluate conditions.                                        *
+*                                                                         *
+static BOOL simple_x_scale(asm_workspace *wp, workspace *ws)
+/* Return true if 1:1 along x, and the other conditions required by the
+ * 1:1 (2-at-a-time) inner loop hold as well: xdiv <= xcount, not PLOTMASK,
+ * gcol action 0, and no ordered dither.
+ */
+  return (  wp->save_xadd - wp->save_xdiv == wp->save_xdiv
+         && wp->save_xdiv <= wp->save_xcount
+         && !PLOTMASK
+         && ws->gcol == 0
+         && !ws->odither /* CAN be done, but the code sequences get awfully big so let's cut it out for now. */
+         ? TRUE : FALSE);
+  /* Without the second test we MIGHT have to omit the first pixel, which the 1:1 code doesn't allow for. */
+  /* The 2-at-a-time loop doesn't allow for PLOTMASK - not important enough. */
+  /* The 2-at-a-time loop doesn't allow for any gcol but 0 - not important enough. */
+static BOOL x_block_move(asm_workspace *wp, workspace *ws)
+/* Returns true if the inner loop is the simple movement of a block of bits:
+ * the x scale is simple (see simple_x_scale), the output bits per character
+ * (wp->BPC) equals the input's (1 << save_inlog2bpc), gcol action is 0, and
+ * there is no source mask, no translation table and no calibration table.
+ */
+  return (  simple_x_scale(wp, ws)
+         && wp->BPC == (1<<wp->save_inlog2bpc)
+         && ws->gcol == 0
+         && !SOURCE_MASK
+         && !SOURCE_TABLE
+         && wp->cal_table == 0
+         ? TRUE : FALSE);
+static BOOL simple_y_scale(asm_workspace *wp, workspace *ws)
+/* Return true if 1:1 along y, i.e. save_yadd equals save_ydiv. ws is unused. */
+  UNUSED(ws);
+  return wp->save_yadd == wp->save_ydiv;
+static int palette_is_grey(int *palette, int entries)
+/* Scan a palette looking how they increment to deduce if it's just greyscale.
+ *   palette - array of palette words.
+ *   entries - number of words to examine.
+ * Returns 0 if any entry's bits 8..15, 16..23 and 24..31 are not all equal;
+ * 2 if every entry is grey and entry number i has the value i in bits 8..15;
+ * 1 if every entry is grey but the levels are not that ascending sequence.
+ */
+  int loop;
+  int entry;
+  int ascending = 1;
+  for (loop=0;loop<entries;loop++)
+  {
+    entry = palette[loop];
+    /* XOR with itself shifted down a byte compares each of the top three bytes with its neighbour. */
+    if (((entry ^ (entry>>8)) & 0xffff00) != 0)
+      return 0;
+    if ((entry & 0xff00)>>8 != loop)
+      ascending = 0;
+  }
+  if (ascending)
+   return 2;
+  return 1;
+*                                                                         *
+*    Bitblit: Register allocation.                                        *
+*                                                                         *
+static void ptrs_rn(asm_workspace *wp, workspace *ws)
+/* Declare the pointer registers, which must be visible in both the x-loop and the y-loop.
+ * r_pixel is fixed at r14; the others are allocated in order from the next free register.
+ */
+  /* r_pixel is always needed, and need not be saved between loops.
+   * So, we put it in r14 to remove the need for the register allocator
+   * to worry about r14.
+   */
+  RN(r_pixel, 14, "fetched and translated pixel")
+  /* In most cases there are not enough registers, and the control of
+   * the outer (y) loop requires swapping two 'banks' of registers.
+   * inptr, outptr (and maskinptr if it exists) are always registers
+   * r0, r1, r2, and they are visible when the y registers are swapped in.
+   */
+  RN(r_inptr, -1, PLOTMASK ? "ECF pattern pointer" : "input word pointer")
+  RN(r_outptr, -1, "word pointer to output")
+  if (SOURCE_BPPMASK || PLOTMASK) RN(r_maskinptr, -1, "mask input word pointer")
+  if (ws->odither) RN(r_oditheradd, -1, "ordered dither offset value")
+  /* The initial dither add value needs to be changed for every output line,
+   * so it helps to have r_oditheradd visible in the y loop
+   */
+static void xloop_rn(asm_workspace *wp, workspace *ws)
+/* Other variables for the x-loop.
+ * Declares (allocates register numbers for) the registers the inner loop needs,
+ * depending on whether this is a block move, a PLOTMASK operation, or a normal
+ * fetch/translate/save loop. The temporaries r_temp1/r_temp2 are declared last.
+ */
+  int need_temps = 0; /* set to 1 or 2 if temp1 and temp2 are needed */
+  if (x_block_move(wp, ws))
+  {
+    /* X loop is very very simple, and communicates with machine-code block-shift routine. */
+    RN(r_inshift, -1, "Number of (most sig) bits of first input word to transfer, in 1..32")
+    RN(r_outshift, -1, "Number of (most sig) bits of first output word to fill, in 1..32")
+    RN(r_xsize, -1, "Number of bits to transfer per row")
+    RN(r_blockroutine, -1, "Block transfer routine")
+    /* Those registers had better be the same ones as the assembler code is expecting! */
+    assert(ws->regnames.r_inptr.regno == 0, ERROR_FATAL);
+    assert(ws->regnames.r_outptr.regno == 1, ERROR_FATAL);
+    assert(ws->regnames.r_inshift.regno == 2, ERROR_FATAL);
+    assert(ws->regnames.r_outshift.regno == 3, ERROR_FATAL);
+    assert(ws->regnames.r_xsize.regno == 4, ERROR_FATAL);
+  }
+  else
+  {
+    /* Normal case - declare whatever other registers are needed for fetching and translating pixels. */
+    if (PLOTMASK)
+      RN(r_inword, -1, "ECF pattern input word")
+    else if (!SOURCE_32_BIT) /* if not 32-bit source */
+    {
+      RN(r_inshift, -1, "bit shift of current pixel LSL #27")
+      RN(r_inword, -1, "current input word")
+    }
+    if (SOURCE_MASK)
+    {
+      RN(r_maskinword, -1, "current mask word")
+      /* NOTE(review): the 'else' below has no matching 'if' in the lines visible
+       * here; the condition selecting r_maskinshift versus r_masko is not shown - confirm.
+       */
+        RN(r_maskinshift, -1, "bit shift of current mask pixel")
+      else
+        RN(r_masko, -1, "offset of mask data from sprite data")
+    }
+    if (  need_temps == 0
+       && (ws->gcol != 0)
+       && DEST_32_BIT       /* use in save_pixel */
+       )
+       need_temps = 1;
+    if (PLOTMASK)
+    {
+      RN(r_ecfindex, -1, "index into ECF pattern")
+      RN(r_bgcolour, -1, "background plotting colour")
+    }
+    else
+    {
+      if (SOURCE_TABLE || wp->cal_table) RN(r_table, -1, "translation table or palette")
+      {
+        /* Work out whether we need 16->32 or 32->16 transformations, with their temp registers
+         * So, mirror the structure of translate_pixel
+         */
+        int pixl2bpp = wp->save_inlog2bpp;
+        /* A palette lookup yields a 32bpp pixel, except when the output is 16bpp. */
+        if ((wp->trns_palette != 0) && (wp->BPP != 16)) pixl2bpp = 5;
+        if (pixl2bpp == 5 && wp->BPP != 32) need_temps = 2;
+        if (pixl2bpp == 4 && wp->BPP == 32)
+        {
+          need_temps = 2;
+          RN(r_c1632, -1, "constant for 16->32 transformation")
+        }
+      }
+      if ( need_temps == 0
+        && (wp->save_xmag % wp->save_xdiv) == 0
+        && (wp->save_xmag / wp->save_xdiv) > 4    /* used in optimised scale up */
+         )
+       need_temps = 1;
+    }
+    /* Declare whatever registers needed for saving the new pixel
+     * into the current destination pixel.
+     */
+    if (!DEST_32_BIT)
+    {
+      RN(r_outword, -1, "current output word")
+      RN(r_outshift, -1, "bit shift of current pixel in current output word LSL 27")
+    }
+    if (wp->save_inlog2bpp <= 3 && simple_x_scale(wp, ws))
+      /* going to use 2-at-a-time loop - if 16bpp or more, don't need this register. */
+      RN(r_in_pixmask, -1, "pixel mask for 2-at-a-time loop")
+    /* Declare whatever registers are needed for control of
+     * horizontal scaling. For some simple cases no scaling registers
+     * are needed.
+     */
+    RN(r_xsize, -1, "number of output pixels per row")
+    if (!simple_x_scale(wp, ws)) /* not 1:1 scale */
+      RN(r_xcount, -1, "total for x scale")
+      /* Adder and subtractor values become constants in the code. */
+  }
+  /* The temporaries are shuffled to the end, so that if r12 (the assembler wp) is used then
+   * it does not get loaded before the y loop variables are initialised.
+   */
+  if (need_temps >= 1) RN(r_temp1, -1, "temp1 for pixel transformation temporary values")
+  if (need_temps >= 2) RN(r_temp2, -1, "temp2 for pixel transformation temporary values")
+  /* MAX POSSIBLE REQUIREMENT - 13, if vcount stuff not done.
+   * It may appear 15, but temp1 and temp2 are only needed if one of src/dst
+   * is 32bpp, in which case we save elsewhere.
+   * >>> AH not so, they are also needed if a palette is used, in which case
+   * the source can be fewer bpp. Ooops. Can we ever overflow? Not sure.
+   */
+static int yloop_rn_count(asm_workspace *wp, workspace *ws)
+/* Say how many registers yloop_rn will declare.
+ * The conditions here must be kept in step with those in yloop_rn.
+ */
+  int result = 2;                                        /* r_ysize, r_inoffset */
+  if (wp->save_yadd != wp->save_ydiv) result++;          /* r_ycount */
+  if (SOURCE_BPPMASK || PLOTMASK) result++;              /* r_maskinoffset */
+  if (wp->is_it_jpeg) result++;                          /* r_fetchroutine */
+  return result;
+static void yloop_rn(asm_workspace *wp, workspace *ws)
+/* Declare whatever registers are needed for control of
+ * the vertical loop. These registers are part of a separate 'bank'
+ * from those in the central loop.
+ * Always declares r_ysize and r_inoffset; r_ycount, r_maskinoffset and
+ * r_fetchroutine are declared only when the corresponding condition holds.
+ */
+  RN(r_ysize, -1, "number of output rows");
+  if (!simple_y_scale(wp, ws)) /* not 1:1 scale */
+    RN(r_ycount, -1, "total for y scale")
+  /* Adder and subtractor values become constants in the code. */
+  RN(r_inoffset, -1, "byte offset between input rows.")
+  if (SOURCE_BPPMASK || PLOTMASK) RN(r_maskinoffset, -1, "byte offset between mask rows.")
+  if (wp->is_it_jpeg)             RN(r_fetchroutine, -1, "routine for getting row of decompressed JPEG data.")
+  /* MAX POSSIBLE REQUIREMENT - 5 registers */
+*                                                                         *
+*    Bitblit: Register initialisation.                                    *
+*                                                                         *
+/* Loading a constant index from the workspace pointer.
+ * LDR_WP(reg,value) compiles "LDR reg,[wp,#WP_OFFSET(value)]"; the listing text
+ * is built from the stringised arguments. The macro supplies its own trailing
+ * semicolon, so call sites are written without one.
+ */
+#define LDR_WP(reg,value) ins(ws, LDR(R(reg),R(wp)) + WP_OFFSET(value), \
+                              "LDR     " #reg "," #value);
+/* LDR_WP_C is LDR_WP with an extra comment for the listing. Two definitions
+ * follow: one that formats the listing text, and one that passes 0 as the description.
+ * NOTE(review): presumably these are the DEBUG and non-DEBUG variants, separated
+ * by #else/#endif lines that are not visible here - confirm.
+ */
+#ifdef DEBUG
+  #define LDR_WP_C(reg,value, comment)                                \
+  {                                                                   \
+    char a[256];                                                      \
+    do_sprintf(a, "LDR     " #reg "," #value " %t40.; " comment);        \
+    ins(ws, LDR(R(reg),R(wp)) + WP_OFFSET(value), a);                 \
+  }
+  #define LDR_WP_C(reg,value, comment) ins(ws, LDR(R(reg),R(wp)) + WP_OFFSET(value), 0);
+/* Loading a constant index from a register.
+ * LDR_INDEX(destreg,indexreg,offset,comment) compiles "LDR destreg,[indexreg,#offset]".
+ */
+#ifdef DEBUG
+  #define LDR_INDEX(destreg,indexreg,offset,comment)                                      \
+  {                                                                                       \
+    char a[256];                                                                          \
+    do_sprintf(a, "LDR     " #destreg ",[" #indexreg ", #%i] %t40.; " comment, offset);      \
+    ins(ws, LDR(R(destreg),R(indexreg)) | OFFSET(offset), a);                             \
+  }
+  #define LDR_INDEX(destreg,indexreg,offset,comment) ins(ws, LDR(R(destreg),R(indexreg)) | OFFSET(offset), 0);
+static void get_in_shift(asm_workspace *wp, workspace *ws)
+/* Used within fetch_pixel_init, to load r_inshift. The complication is
+ * that if this is JPEG data then the save_inshift value was not calculated,
+ * because SpriteExtend assembler stuff thought this was 32bit data. This
+ * only matters if JPEG is being made to produce 8bpp or 16bpp data.
+ *
+ * In the JPEG case the generated code derives the value from the input x
+ * coordinate (in_x); otherwise it computes 32 - save_inshift.
+ */
+  if (wp->is_it_jpeg && wp->save_inlog2bpp != 5)
+  {
+    LDR_WP_C(r_inshift, in_x, "input x coord (JPEG input data)")
+    if (wp->save_inlog2bpp == 4)
+    {
+      /* 16bpp: x&1 gives 0 or 1; 0 is replaced by 2, then scaled by 16 to give 32 or 16. */
+      AND(R(r_inshift), R(r_inshift), S | IMM(1),              "ANDS    r_inshift,r_inshift,#1          ; halfword offset (0 or 1)");
+      MOV(R(r_inshift), EQ | IMM(2),                           "MOVEQ   r_inshift,#2                    ; halfword offset (1 or 2)");
+      MOV(R(r_inshift), OP2R(R(r_inshift)) | LSLI(4),          "MOV     r_inshift,r_inshift,LSL #4      ; 16/32 bit offset");
+    }
+    else /* wp->save_inlog2bpp == 3 */
+    {
+      /* 8bpp: (4 - (x&3)) * 8 gives 32, 24, 16 or 8. */
+      AND(R(r_inshift), R(r_inshift), S | IMM(3),              "ANDS    r_inshift,r_inshift,#3          ; byte offset as 0/1/2/3");
+      RSB(R(r_inshift), R(r_inshift), IMM(4),                  "RSB     r_inshift,r_inshift,#4          ; byte offset as 4/3/2/1");
+      MOV(R(r_inshift), OP2R(R(r_inshift)) | LSLI(3),          "MOV     r_inshift,r_inshift,LSL #3      ; 8/16/24/32 bit offset");
+    }
+  }
+  else
+  {
+    LDR_WP_C(r_inshift, save_inshift, "input initial shift")
+    RSB(R(r_inshift), R(r_inshift), IMM(32),                 "RSB     r_inshift,r_inshift,#32         ; pixels of first word to transfer, in 1..32");
+  }
+static void fetch_pixel_init(asm_workspace *wp, workspace *ws)
+/* Initialise whatever registers are needed for fetching and translating
+ * pixels.
+ * Compiles the code that loads, from the assembler workspace, the input
+ * pointer, input shift, mask registers, translation table pointer, the
+ * 2-at-a-time pixel mask, the 16->32 constant and the ordered dither start
+ * value - each only if the corresponding register is in use for this plot.
+ */
+  /* The input word pointer */
+  if (PLOTMASK)
+  {
+    LDR_WP_C(r_inptr, save_ecflimit, "base of ECF pattern")
+  }
+  else if (wp->is_it_jpeg)
+  {
+    LDR_WP_C(r_inptr, in_y, "initial y coordinate (for JPEG data)")
+  }
+  else /* normal data source for PutSpriteScaled */
+  {
+    LDR_WP_C(r_inptr, save_inptr, "input word pointer")
+  }
+  /* all other registers re fetching input data */
+  if (x_block_move(wp, ws))
+  {
+    /* Prepare for machine code core to inner loop */
+    /* NOTE(review): the '#if 0' below has no visible #else/#endif; presumably the
+     * two inline instructions are disabled in favour of get_in_shift() - confirm.
+     */
+#if 0
+    LDR_WP_C(r_inshift, save_inshift, "input initial shift (for block move)")
+    RSB(R(r_inshift), R(r_inshift), IMM(32),                 "RSB     r_inshift,r_inshift,#32         ; pixels of first word to transfer, in 1..32");
+    get_in_shift(wp, ws);
+    LDR_WP(r_blockroutine, ccompiler_bitblockmove)
+  }
+  else
+  {
+    /* initialise r_inptr */
+    if (PLOTMASK)
+    {
+      LDR_WP(r_ecfindex, save_ecfptr) /* byte index into ECF pattern, not rounded */
+      AND(R(r_pixel), R(r_ecfindex), IMM(0x18),              "AND     r_pixel,r_ecfindex,#&18         ; extract initial row offset in ECF");
+      ADD(R(r_inptr), R(r_inptr), OP2R(R(r_pixel)),          "ADD     r_inptr,r_inptr,r_pixel         ; and add to initial ECF row address");
+      LDR_WP(r_bgcolour, bgcolour) /* background colour pixel */
+    }
+    else
+    {
+      /* r_inword and r_inshift */
+      if (!SOURCE_32_BIT) /* if not 32-bit source */
+      {
+        /* r_inword not initialised yet, done in inner loop */
+#if 0
+        LDR_WP(r_inshift, save_inshift)
+        RSB(R(r_inshift), R(r_inshift), IMM(32),             "RSB     r_inshift,r_inshift,#32         ; pixels still to shift");
+        get_in_shift(wp, ws);
+        MOV(R(r_inshift), OP2R(R(r_inshift)) | LSLI(27),     "MOV     r_inshift,r_inshift,LSL #27     ; keep up at top end of register");
+      }
+    }
+    /* mask registers */
+    if (SOURCE_MASK)
+    {
+      /* NOTE(review): the block below is followed by an 'else' but no 'if' line is
+       * visible here; the condition choosing between the r_maskinshift set-up and
+       * r_masko is not shown - confirm.
+       */
+      {
+        LDR_WP(r_maskinshift, save_maskinshift)
+        if (SOURCE_BPPMASK)
+        {
+          LDR_WP(r_maskinptr, save_maskinptr)
+        }
+        else /* PLOTMASK and not BPPMASK */
+        {
+          LDR_WP_C(r_maskinptr, save_inptr, "mask pointer for PlotMaskScaled")
+          LDR_WP(r_pixel, save_masko) /* temp use of r_pixel */
+          ADD(R(r_maskinptr), R(r_maskinptr), OP2R(R(r_pixel)),"ADD     r_maskinptr,r_maskinptr,r_pixel ; mask pointer (for PlotMask)");
+        }
+        RSB(R(r_maskinshift), R(r_maskinshift), IMM(32),   "RSB     r_maskinshift,r_maskinshift,#32 ; pixels still to shift");
+        MOV(R(r_maskinshift),
+            OP2R(R(r_maskinshift)) | LSLI(27),             "MOV     r_maskinshift,r_maskinshift,LSL #27 ; keep up at top end of register");
+      }
+      else
+        LDR_WP(r_masko, save_masko)
+    }
+    /* translation registers */
+    /* Priority: calibration table, then palette, then colour translation table. */
+    if (wp->cal_table) LDR_WP(r_table, cal_table)
+    else if (wp->trns_palette != 0) LDR_WP(r_table, trns_palette)
+    else
+    {
+      if (wp->ColourTTR != 0)
+      {
+        LDR_WP(r_table, ColourTTR)
+        if (  wp->BPP <= 8                  /* 256 colours or less on output */
+           && wp->save_inlog2bpp >= 4       /* thousands or millions of input colours */
+           )
+        {
+          ins(ws, LDR(R(r_table), R(r_table)) | OFFSET(4), "LDR     r_table,[r_table,#4]            ; load base of 32K table");
+        }
+      }
+    }
+    if (wp->save_inlog2bpp <= 3 && simple_x_scale(wp, ws))
+      MOV(R(r_in_pixmask), IMM(ws->in_pixmask),           "MOV     r_in_pixmask,#in_pixmask        ; for use in 2-at-a-time loop");
+    /* temp1 and temp2 need no initialisation. */
+    if (ws->regnames.r_c1632.regno != -1) /* Generate binary constant 0000000011100000 1110000011100000 */
+    {
+      MOV(R(r_c1632), IMM(0xe0),                          "MOV     r_c1632,#&e0                    ; 0000000000000000 0000000011100000");
+      ORR(R(r_c1632), R(r_c1632),OP2R(R(r_c1632))|LSLI(8),"ORR     r_c1632,r_c1632,r_c1632,LSL #8  ; 0000000000000000 1110000011100000");
+      ORR(R(r_c1632), R(r_c1632),OP2R(R(r_c1632))|LSLI(8),"ORR     r_c1632,r_c1632,r_c1632,LSL #8  ; 0000000011100000 1110000011100000");
+    }
+    if (ws->odither)
+    {
+      /* We use ordered dither to attempt to increase the output resolution by almost two bits.
+       * This only happens for a 16bpp or 32bpp source that's being truncated somewhat.
+       * A square of output pixels has the following binary addition values:
+       *              11    01
+       *              00    10
+       * These values are added to the value of each of R/G/B, just before those values are
+       * truncated or looked up in a table, shifted so that we add to the bits which are
+       * just about to be discarded.
+       * We keep the value to add in r_oditheradd.
+       * To proceed along the x axis we EOR by 10 every output pixel.
+       * We must also EOR by 01 every line.
+       * The starting value must be aligned with the origin of the output.
+       */
+      comment(ws, "Compute initial dither addition value - bit 0 changes every y, bit 1 every x");
+      LDR_WP(r_pixel, save_xcoord)
+      AND(R(r_pixel), R(r_pixel), IMM(1),                        "AND     r_pixel,r_pixel,#1               ; least sig bit of x, for dither");
+      LDR_WP(r_oditheradd, save_ycoord)
+      AND(R(r_oditheradd), R(r_oditheradd), IMM(1),              "AND     r_oditheradd,r_oditheradd,#1     ; least sig bit of y, for dither");
+      EOR(R(r_pixel),R(r_pixel),OP2R(R(r_oditheradd)),           "EOR     r_pixel,r_pixel,r_oditheradd     ; if we start Y off on an odd footing, invert x as well");
+      ORR(R(r_oditheradd), R(r_oditheradd),
+      OP2R(R(r_pixel)) | LSLI(1),                                "ORR     r_oditheradd,r_oditheradd,r_pixel,LSL #1 ; dither add value");
+      /* The dither should start based on the current ECF offset */
+      MOV(R(r_pixel),IMM(0x10) | IMMROR(24),                     "MOV     r_pixel,#&1000                   ; prepare to get ECFYOffset");
+      LDR_INDEX(r_pixel,r_pixel,0x1FC,"get kernel variable ECFYOffset from &11FC")
+      TST(R(r_pixel),IMM(1),                                     "TST     r_pixel,#1                       ; is Y ECF offset odd?");
+      EOR(R(r_oditheradd),R(r_oditheradd),NE | IMM(3),           "EORNE   r_oditheradd,r_oditheradd,#3     ; if so, change ordered dither origin to match");
+      MOV(R(r_pixel),IMM(0x10) | IMMROR(24),                     "MOV     r_pixel,#&1000                   ; prepare to get ECFShift");
+      LDR_INDEX(r_pixel,r_pixel,0x1F8,"get kernel variable ECFShift from &11F8")
+      TST(R(r_pixel),IMM(wp->BPP),                               "TST     r_pixel,#out_bpp                 ; is ECF Shift an odd number of pixels?");
+      EOR(R(r_oditheradd),R(r_oditheradd),NE | IMM(2),           "EORNE   r_oditheradd,r_oditheradd,#2     ; if so, change ordered dither origin to match");
+      /* Shift the dither value to the top of the register. */
+      {
+        IFDEBUG(char a[256];)
+        IFDEBUG(do_sprintf(a, "MOV     r_oditheradd,r_oditheradd,LSL #%i %t40; shift to top of word", 23 + ws->odither);)
+        MOV(R(r_oditheradd), OP2R(R(r_oditheradd)) | LSLI(23 + ws->odither), a);
+      }
+    }
+  }
+  newline();
+static void save_pixel_init(asm_workspace *wp, workspace *ws)
+/* Initialise whatever registers are needed for saving the new pixel
+ * into the current destination pixel.
+ * Always loads r_outptr; r_outshift is derived from the output x coordinate
+ * when the output is not 32 bits per pixel or a block move is being done.
+ */
+  LDR_WP(r_outptr, save_outptr)
+  if (x_block_move(wp, ws))
+  {
+    /* Very simple inner loop */
+    LDR_WP_C(r_pixel, save_xcoord, "get initial output x coord in pixels") /* Measured in pixels */
+    AND(R(r_outshift), R(r_pixel), IMM(ws->out_ppw-1),            "AND     r_outshift,r_pixel,#out_ppw-1   ; pix offset of start");
+    MOV(R(r_outshift),OP2R(R(r_outshift)) | LSLI(ws->out_l2bpc),  "MOV     r_outshift,r_outshift,LSL #out_l2bpc ; bit offset of start, in 0..31");
+    RSB(R(r_outshift), R(r_outshift), IMM(32),                    "RSB     r_outshift,r_outshift,#32       ; pixels of space, in 1..32");
+  }
+  else
+  {
+    /* Normal cases */
+    if (PLOTMASK || !DEST_32_BIT)
+      LDR_WP_C(r_pixel, save_xcoord, "output x coord measured in pixels")
+    if (PLOTMASK)
+    {
+      MOV(R(r_ecfindex), OP2R(IMM(0)),                          "MOV     r_ecfindex, #0               ; should always be 0 ?");
+      /* NOTE(review): the '#if 0' below has no visible #endif; presumably the
+       * ECF index computation up to the closing brace is disabled - confirm.
+       */
+#if 0
+      AND(R(r_ecfindex), R(r_pixel), IMM(ws->out_ppw),          "AND     r_ecfindex,r_pixel,#out_ppw  ; pixels into ECF pattern");
+      /* Convert from pixels, to byte offset into ECF line - either 0 or 4 */
+      if (ws->out_l2ppw > 2) /* > 4 output pixels per word */
+        MOV(R(r_ecfindex), OP2R(R(r_ecfindex))
+            | LSRI(ws->out_l2ppw - 2),                          "MOV     r_ecfindex,r_ecfindex,LSR #out_l2ppw-2 ; convert to byte offset");
+      if (ws->out_l2ppw < 2) /* < 4 output pixels per word (ie 2 or 1) */
+        MOV(R(r_ecfindex), OP2R(R(r_ecfindex))
+            | LSLI(2 - ws->out_l2ppw),                          "MOV     r_ecfindex,r_ecfindex,LSL #2-out_l2ppw ; convert to byte offset");
+    }
+    if (!DEST_32_BIT)
+    {
+      /* Same computation as the block-move case, but the result is kept in the top 5 bits. */
+      AND(R(r_outshift), R(r_pixel), IMM(ws->out_ppw-1),          "AND     r_outshift,r_pixel,#out_ppw-1 ; pixel offset of start");
+      MOV(R(r_outshift),OP2R(R(r_outshift)) | LSLI(ws->out_l2bpc),"MOV     r_outshift,r_outshift,LSL #out_l2bpc ; bit offset of start");
+      RSB(R(r_outshift), R(r_outshift), IMM(32),                  "RSB     r_outshift,r_outshift,#32       ; pixels still to rotate");
+      MOV(R(r_outshift), OP2R(R(r_outshift)) | LSLI(27),          "MOV     r_outshift,r_outshift,LSL #27   ; up at the top");
+    }
+  }
+static void xloop_init(asm_workspace *wp, workspace *ws)
+/* Initialise whatever registers are needed for control of
+ * horizontal scaling. For some simple cases no scaling registers
+ * are needed.
+ * Always loads r_xsize; for a block move it is then converted to a size in bits.
+ */
+  LDR_WP(r_xsize, save_xsize)
+  if (!simple_x_scale(wp, ws)) /* not 1:1 scale */
+  {
+    /* With ordered dither on a 16-bit source, save_xcount goes into r_pixel instead of r_xcount. */
+    if ((ws->odither) && (SOURCE_16_BIT))
+    {
+      LDR_WP(r_pixel, save_xcount); /* Changed by (GPS) to fix register spill bug*/
+    }
+    else
+    {
+      LDR_WP(r_xcount, save_xcount);
+    }
+  }
+  if (x_block_move(wp, ws))
+    MOV(R(r_xsize), OP2R(R(r_xsize)) | LSLI(ws->out_l2bpc),       "MOV     r_xsize,r_xsize,LSL #out_l2bpc  ; size in bits");
+static void yloop_init(asm_workspace *wp, workspace *ws)
+/* Initialise whatever registers are needed for control of
+ * the vertical loop. These registers are part of a separate 'bank'
+ * from those in the central loop.
+ * Loads r_ysize always, and r_fetchroutine, r_ycount, r_inoffset and
+ * r_maskinoffset under the conditions tested below.
+ */
+  if (wp->is_it_jpeg) LDR_WP_C(r_fetchroutine, fetchroutine, "routine to call to get JPEG data line")
+  LDR_WP(r_ysize, save_ysize)
+  if (!simple_y_scale(wp, ws)) /* not 1:1 scale */ LDR_WP(r_ycount, save_ycount)
+  if (!PLOTMASK)
+  {
+    if (wp->is_it_jpeg)
+      /* We could save this register, but there's not all that much point - simpler to code like this. */
+      MOV(R(r_inoffset),IMM(1),                                   "MOV     r_inoffset,#1                   ; JPEG coord offset on input");
+    else
+      LDR_WP(r_inoffset, save_inoffset)
+  }
+  /* For PLOTMASK without a separate bpp mask, the mask rows use the sprite's own row offset. */
+  if (SOURCE_BPPMASK) LDR_WP(r_maskinoffset, save_maskinoffset)
+  else if (PLOTMASK) LDR_WP(r_maskinoffset, save_inoffset)
+*                                                                         *
+*    Bitblit: Pixel loading, translation, saving.                         *
+*                                                                         *
+static void fetch_pixel_unmasked(asm_workspace *wp, workspace *ws)
+/* Assuming no mask, get the next input pixel and put it in r_pixel. This is separated
+ * from fetch_pixel for the case of scaling up an ordered dither, where the same input
+ * pixel is repeatedly fetched and translated.
+ *
+ * Two families of code are generated:
+ *  - PLOTMASK (plotting an ECF pattern through the mask):
+ *      32-bit destination: r_inword and r_bgcolour are loaded from
+ *        [r_inptr,r_ecfindex] and [r_inptr,r_ecfindex+4]; r_ecfindex is
+ *        restored afterwards. r_pixel is not written in this case.
+ *      16-bit destination: r_pixel = low 16 bits of r_inword.
+ *      otherwise: nothing is generated (the code is commented out).
+ *  - normal source: r_pixel is loaded from [r_inptr] for 32-bit sources,
+ *      taken as the low 16 bits of r_inword for 16-bit sources, or masked out
+ *      of r_inword with in_pixmask for anything narrower.
+ */
+  if (PLOTMASK)
+  {
+    comment(ws, "Fetch an ECF pixel");
+    if (DEST_32_BIT)
+    {
+      ins(ws, LDR(R(r_inword), R(r_inptr))
+             | INDEX(R(r_ecfindex), 0),                      "LDR     r_inword,[r_inptr,r_ecfindex] 2222");
+      ADD(R(r_ecfindex), R(r_ecfindex),
+            IMM(4),                                          "ADD     r_ecfindex,r_ecfindex,#4  5t453");
+      ins(ws, LDR(R(r_bgcolour), R(r_inptr))
+            | INDEX(R(r_ecfindex), 0),                       "LDR     r_bgcolour,[r_inptr,r_ecfindex]   ; load next EOR word of ECF222");
+      SUB(R(r_ecfindex), R(r_ecfindex),
+            IMM(4),                                          "SUB     r_ecfindex,r_ecfindex,#4 1212"); /* undo the ADD above */
+    }
+    else
+    {
+      if (DEST_16_BIT)
+      {
+          /* LSL #16 then LSR #16 clears the top half, leaving the low 16 bits of r_inword */
+          MOV(R(r_pixel), OP2R(R(r_inword)) | LSLI(16),     "MOV     r_pixel,r_inword,LSL #16        ; fetch 16 bit ECF pattern pixel");
+          MOV(R(r_pixel), OP2R(R(r_pixel)) | LSRI(16),      "MOV     r_pixel,r_pixel,LSR #16");
+      }
+      else
+      {
+        /* Disabled: no instructions are generated for ECF plots below 16bpp here. */
+//        AND(R(r_pixel), R(r_inword), IMM(ws->out_pixmask),"AND     r_pixel,r_inword,#out_pixmask    ; fetch the pixel from the ECF pattern");
+//        AND(R(r_pixel), R(r_pixel), OP2R(R(r_bgcolour)),  "AND     r_pixel,r_pixel,r_bgcolour       ; turn it into a background colour pixel");
+      }
+    }
+  }
+  else
+  {
+    comment(ws, "Fetch a source pixel");
+    if (SOURCE_32_BIT)
+      ins(ws, LDR(R(r_pixel), R(r_inptr)) | OFFSET(0),    "LDR     r_pixel,[r_inptr]");
+    else if (SOURCE_16_BIT)
+    {
+      /* LSL #16 then LSR #16 isolates the low 16 bits of r_inword */
+      MOV(R(r_pixel), OP2R(R(r_inword)) | LSLI(16),     "MOV     r_pixel,r_inword,LSL #16        ; fetch 16 bit pixel");
+      MOV(R(r_pixel), OP2R(R(r_pixel)) | LSRI(16),      "MOV     r_pixel,r_pixel,LSR #16");
+      /* >>> Maybe we can leave it in the top 16 bits, and get by? Not yet. */
+    }
+    else
+    {
+      AND(R(r_pixel), R(r_inword), IMM(ws->in_pixmask), "AND     r_pixel,r_inword,#in_pixmask    ; fetch the pixel");
+    }
+  }
+static BOOL fetch_pixel(asm_workspace *wp, workspace *ws, label *l_masked)
+/* Check the mask, fetch the current pixel. If the current pixel is
+ * transparent then branch out to l_masked. Return TRUE if the branch could be
+ * taken, else FALSE.
+ *
+ * The mask test is TST r_maskinword,#1 followed by a BEQ to l_masked; the
+ * pixel itself is then fetched by fetch_pixel_unmasked(). The value returned
+ * is SOURCE_MASK.
+ * NOTE(review): the condition guarding the braced mask test below is not
+ * visible in this copy of the source - presumably SOURCE_MASK; confirm.
+ */
+  IFDEBUG(char a[256];) /* buffer for the debug text of the branch */
+  {
+    TST(R(r_maskinword), IMM(1),                      "TST     r_maskinword,#1");
+    IFDEBUG(do_sprintf(a,                                "BEQ     %s", l_masked->name);)
+    branch(ws, B | EQ, l_masked, a);
+  }
+  fetch_pixel_unmasked(wp, ws);
+  return SOURCE_MASK;
+static BOOL fetch_pixel2(asm_workspace *wp, workspace *ws, label *l_masked)
+/* Check the mask, fetch the pixel after the current one. You are assured
+ * that no word of input need be loaded between these two. If the pixel is
+ * transparent then branch out to l_masked. Return TRUE if the branch could be
+ * taken, else FALSE.
+ *
+ * Mask handling (only when SOURCE_MASK):
+ *  - SOURCE_BPPMASK: the mask word is rotated on by mask_bpp, r_maskinshift
+ *    is stepped (mask_bpp*2 rotated right by 6, i.e. mask_bpp << 27), a new
+ *    mask word is loaded with writeback if that reached zero, and bit 0 is
+ *    tested.
+ *  - otherwise: bit mask_bpc of r_maskinword is tested in place.
+ * The pixel is then taken from [r_inptr,#4] (32-bit source), from the top
+ * half of r_inword (16-bit source), or from r_inword shifted right by in_bpc
+ * and masked with r_in_pixmask. The value returned is SOURCE_MASK.
+ *
+ * NOTE(review): as the code reads here, the debug string 'a' is only filled
+ * in on the non-SOURCE_BPPMASK path, yet it is passed to branch() on both
+ * paths - in a DEBUG build the SOURCE_BPPMASK path appears to pass an
+ * uninitialised buffer. Confirm against the complete source.
+ */
+#ifdef DEBUG
+  char a[256];
+  assert(!PLOTMASK, ERROR_FATAL); /* Doesn't do 2-at-a-time loop */
+  if (SOURCE_MASK) /* Test the second pixel of mask */
+  {
+    if (SOURCE_BPPMASK) /* we may have reached the end of mask word if not doing an aligned plot */
+    {
+      MOV(R(r_maskinword), OP2R(R(r_maskinword))
+                       | RORI(ws->mask_bpp),                 "x"/*MOV     r_maskinword,r_maskinword,ROR #mask_bpp"*/);
+      SUB(R(r_maskinshift),R(r_maskinshift),
+                       S | IMM(ws->mask_bpp*2) | IMMROR(6),  "x"/*SUBS    r_maskinshift,r_maskinshift,#mask_bpp:SHL:27"*/);
+      ins(ws, LDR(R(r_maskinword), R(r_maskinptr))
+          | EQ | WRITEBACK | OFFSET(4),                     "x"/* "LDREQ   r_maskinword,[r_maskinptr,#4]!     ; load more mask pixels (inc2)"*/);
+      TST(R(r_maskinword), IMM(1),                           "TST     r_maskinword,#1");
+    }
+    else
+    {
+      /* For mask_bpc >= 8 the bit is expressed as 1 rotated right by (32 - mask_bpc) */
+      TST(R(r_maskinword),
+          ws->mask_bpc < 8
+            ? IMM(1 << ws->mask_bpc)
+            : IMM(1) | IMMROR(32 - ws->mask_bpc),       "TST     r_maskinword,#1:SHL:mask_bpc");
+      IFDEBUG(do_sprintf(a,                                "BEQ     %s", l_masked->name);)
+    }
+    branch(ws, B | EQ, l_masked, a);
+  }
+  comment(ws, "Fetch the source pixel after the current one");
+  if (SOURCE_32_BIT)
+    ins(ws, LDR(R(r_pixel), R(r_inptr)) | OFFSET(4),  "LDR     r_pixel,[r_inptr,#4]");
+  else if (SOURCE_16_BIT)
+  {
+    MOV(R(r_pixel), OP2R(R(r_inword)) | LSRI(16),     "MOV     r_pixel,r_inword,LSR #16");
+    /* >>> Getting it into top 16bits harder in this case! */
+  }
+  else
+    AND(R(r_pixel), R(r_in_pixmask),
+        OP2R(R(r_inword)) | LSRI(ws->in_bpc),         "AND     r_pixel,r_in_pixmask,r_inword,LSR #in_bpc"
+                                                      " ; fetch the next pixel");
+  return SOURCE_MASK;
+#ifdef DEBUG
+static void add_ordered_dither_gun(asm_workspace *wp, workspace *ws, int bits_per_gun, int offset, char *gun)
+#define add_ordered_dither_gun(a,b,c,d,e) do_add_ordered_dither_gun(a,b,c,d)
+static void do_add_ordered_dither_gun(asm_workspace *wp, workspace *ws, int bits_per_gun, int offset)
+/* Do one gun of the ordered dither - entirely local to add_ordered_dither below
+ * Offset is the offset from bit 0 of the base of this field of the colour
+ *
+ * The gun name ('gun') is only used in the debug text; the macro form above
+ * drops that argument and calls do_add_ordered_dither_gun().
+ * Two instructions are generated: a CMN of r_oditheradd against r_pixel
+ * shifted left by x, so that the addition carries out exactly when this gun
+ * would overflow, then an ADDCC that adds r_oditheradd (shifted right by x,
+ * back down to the gun's position) only when there was no carry.
+ * NOTE(review): the #else/#endif lines that select between the two
+ * definitions are not visible in this copy of the source.
+ */
+  int x = 32 - bits_per_gun - offset; /* amount to shift the colour field in question */
+#ifdef DEBUG
+  char a[256];
+  IFDEBUG(do_sprintf(a,                                  "CMN     r_oditheradd,r_pixel,LSL #%i %t40; will the %s value overflow?", x, gun);)
+  CMN(R(r_oditheradd), OP2R(R(r_pixel)) | LSLI(x), a);
+  IFDEBUG(do_sprintf(a,                                  "ADDCC   r_pixel,r_pixel,r_oditheradd,LSR #%i %t40; if not, add.", x);)
+  ADD(R(r_pixel), R(r_pixel), CC | OP2R(R(r_oditheradd)) | LSRI(x), a);
+  UNUSED(wp);
+static void add_ordered_dither(asm_workspace *wp, workspace *ws, int bits_per_gun)
+/* bits_per_gun is 5 or 8. The 32-bit RGB value in r_pixel should have
+ * r_oditheradd >> (32-bits_per_gun) added to each of R/G/B, except that these
+ * additions should be 'sticky' at 255 in each gun.
+ * 
+ * The resulting values are just about to be truncated somewhat, so the lo
+ * bits of each answer do not matter much. Thus, if the value is currently
+ * 254 we never add, but this doesn't matter.
+ *
+ * Nothing is generated unless ws->odither is set. The guns are processed at
+ * bit offsets 2*bits_per_gun (blue), bits_per_gun (green) and 0 (red).
+ */
+  if (ws->odither) /* turn off for now */
+  {
+    comment(ws, "Add current value for ordered dither");
+    add_ordered_dither_gun(wp, ws, bits_per_gun, 2*bits_per_gun, "blue");
+    add_ordered_dither_gun(wp, ws, bits_per_gun, 1*bits_per_gun, "green");
+    add_ordered_dither_gun(wp, ws, bits_per_gun, 0, "red");
+    newline();
+  }
+static void translate_pixel(asm_workspace *wp, workspace *ws)
+/* Translate r_pixel from being a source pixel, to being a destination pixel.
+ *
+ * pixl2bpp tracks the log2 bits-per-pixel of the value currently held in
+ * r_pixel; each stage below updates it, and the final assert checks that it
+ * has reached ws->out_l2bpp. Stages, in order:
+ *   1. PLOTMASK: only the AND-action fix-up (see below), then return.
+ *   2. Ordered dither, if enabled.
+ *   3. Source palette lookup through r_table (16bpp or 32bpp entries).
+ *   4. 32bpp -> 16bpp truncation when the destination is not 32bpp.
+ *   5. 16bpp -> 32bpp expansion when the destination is 32bpp.
+ *   6. Translation table lookup, or JPEG error-diffused data passed through,
+ *      or table-less truncation of 16bpp to 1/2/4/8bpp.
+ *   7. AND-action fix-up for destinations other than 32bpp.
+ *
+ * AND-action fix-up: shifting left by 31-BPP, setting bit 31 and shifting
+ * back with ASR leaves the low BPP bits unchanged and sets every bit above
+ * them to 1.
+ */
+  int pixl2bpp = wp->save_inlog2bpp;
+  if (PLOTMASK)
+  {
+    if ((ws->gcol & 7) == 2) /* AND plot action */
+    {
+      MOV(R(r_pixel), OP2R(R(r_pixel)) | LSLI(31-(wp->BPP)),  "MOV     r_pixel, r_pixel, LSL 31-out_bpp ;a");
+      ORR(R(r_pixel), R(r_pixel), IMM(2) | IMMROR(2),         "ORR     r_pixel,r_pixel,#&80000000       ;a");
+      MOV(R(r_pixel), OP2R(R(r_pixel)) | ASRI(31-(wp->BPP)),  "MOV     r_pixel, r_pixel, ASR 31-out_bpp ;a");
+    }
+    return; /* No more transformation necessary */
+  }
+  if (ws->odither) add_ordered_dither(wp, ws, pixl2bpp == 5 ? 8 : 5); /* do ordered dither */
+  comment(ws, "Perform any0 transformation necessary");
+  if (wp->trns_palette != 0)
+  {
+    assert(pixl2bpp <= 3, ERROR_FATAL);
+    if (wp->BPP == 16)
+    {
+      ins(ws, LDR(R(r_pixel), R(r_table))
+            | INDEX(R(r_pixel), 2),                     "LDR     r_pixel,[r_table, r_pixel, LSL #2] ; 16bpp palette lookup");
+      pixl2bpp = 4;
+    }
+    else
+    {
+      ins(ws, LDR(R(r_pixel), R(r_table))
+            | INDEX(R(r_pixel), 3),                     "LDR     r_pixel,[r_table, r_pixel, LSL #3] ; palette lookup");
+      MOV(R(r_pixel), OP2R(R(r_pixel)) | LSRI(8),       "MOV     r_pixel,r_pixel,LSR #8             ; BBGGRR00 to 00BBGGRR ");
+      pixl2bpp = 5;
+    }
+  }
+  if (pixl2bpp == 5 && wp->BPP != 32 && !(wp->is_it_jpeg && (wp->dither_truecolour & 2) && (wp->BPP != 16))) /* all we can do is truncate to 4, as a first stage. */
+  {
+    /* if (wp->BPP == 16 && ws->odither) add_ordered_dither(wp, ws, 8); */
+    comment(ws,                                   " Taken from munge32to16                                      fedcba9876543210 fedcba9876543210");
+    comment(ws,                                   "                                                   r_pixel = 00000000bbbbbbbb ggggggggrrrrrrrr");
+    AND(R(r_temp1),R(r_pixel),IMM(0xf8) | IMMROR(16),   "AND     r_temp1,r_pixel,#&F80000           ; r_temp1 = 00000000bbbbb000 0000000000000000");
+    MOV(R(r_temp2), OP2R(R(r_temp1)) | LSLI(7),         "MOV     r_temp2,r_temp1,LSL #7             ; r_temp2 = 0bbbbb0000000000 0000000000000000");
+    AND(R(r_temp1), R(r_pixel), IMM(0xf8)|IMMROR(24),   "AND     r_temp1,r_pixel,#&f800             ; r_temp1 = 0000000000000000 ggggg00000000000");
+    ORR(R(r_temp2),R(r_temp2),OP2R(R(r_temp1))|LSLI(10),"ORR     r_temp2,r_temp2,r_temp1,LSL #10    ; r_temp2 = 0bbbbbggggg00000");
+    AND(R(r_temp1), R(r_pixel), IMM(0xf8),              "AND     r_temp1,r_pixel,#&F8               ; r_temp1 = 0000000000000000 00000000rrrrr000");
+    ORR(R(r_pixel),R(r_temp2),OP2R(R(r_temp1))|LSLI(13),"ORR     r_pixel,r_temp2,r_temp1,LSL #13    ; r_pixel = 0bbbbbgggggrrrrr");
+    MOV(R(r_pixel),OP2R(R(r_pixel)) | LSRI(16),          "MOV     r_pixel,r_pixel,LSR #16            ; result in bottom half");
+    /* >>> check re keeping 16bit r_pixel at the top */
+    pixl2bpp = 4;
+  }
+  if (pixl2bpp == 4 && wp->BPP == 32) /* pad out to 32bpp */
+  {
+    MOV(R(r_pixel),OP2R(R(r_pixel)) | LSLI(16),          "MOV     r_pixel,r_pixel,LSL #16           ; input in top half");
+    /* >>> check re keeping 16bit r_pixel at the top */
+    comment(ws,                                    "                                                ;           fedcba9876543210 fedcba9876543210");
+    comment(ws,                                    "                                                ; r_pixel = 0bbbbbgggggrrrrr");
+    MOV(R(r_temp1), OP2R(R(r_pixel)) | LSRI(26),         "MOV     r_temp1,r_pixel,LSR #26           ; r_temp1 =                            0bbbbb");
+    MOV(R(r_temp2), OP2R(R(r_temp1)) | LSLI(19),         "MOV     r_temp2,r_temp1,LSL #19           ; r_temp2 =        0bbbbb000 0000000000000000");
+    AND(R(r_temp1), R(r_pixel), IMM(0x3E) | IMMROR(12),  "AND     r_temp1,r_pixel,#&03E00000        ; r_temp1 = 000000ggggg00000");
+    ORR(R(r_temp2), R(r_temp2),OP2R(R(r_temp1))|LSRI(10),"ORR     r_temp2,r_temp2,r_temp1,LSR #10   ; r_temp2 =        0bbbbb000 ggggg00000000000");
+    MOV(R(r_temp1), OP2R(R(r_pixel)) | LSLI(11),         "MOV     r_temp1,r_pixel,LSL #11           ; r_temp1 = rrrrr00000000000 0000000000000000");
+    ORR(R(r_temp2), R(r_temp2),OP2R(R(r_temp1))|LSRI(24),"ORR     r_temp2,r_temp2,r_temp1,LSR #24   ; r_temp2 =        0bbbbb000 ggggg000rrrrr000");
+    comment(ws, "Now copy the top three bits of each colour component into the bottom three");
+    comment(ws,                                    "                                                ; r_c1632 = 0000000011100000 1110000011100000");
+    AND(R(r_temp1), R(r_temp2), OP2R(R(r_c1632)),        "AND     r_temp1,r_temp2,r_c1632           ; r_temp1 = 00000000bbb00000 ggg00000rrr00000");
+    ORR(R(r_pixel), R(r_temp2),OP2R(R(r_temp1))|LSRI(5), "ORR     r_pixel,r_temp2,r_temp1,LSR #5    ; r_pixel = 00000000bbbbbbbb ggggggggrrrrrrrr");
+    pixl2bpp = 5;
+  }
+  /* Translation table lookup */
+  if (wp->ColourTTR != 0)
+  {
+    comment(ws, "We have a translation table.");
+    if (ws->out_l2bpp <= 3) /* ie BPP <= 8 */
+    {
+      assert(pixl2bpp <= 4, ERROR_FATAL); /* up to 32K entries in byte table */
+      /* if (pixl2bpp == 4 && ws->odither) add_ordered_dither(wp, ws, 5); */
+      ins(ws, LDRB(R(r_pixel), R(r_table)) | INDEX(R(r_pixel), 0),  "LDRB    r_pixel,[r_table, r_pixel]      ; byte table lookup");
+    }
+    else
+    {
+      assert(pixl2bpp <= 3, ERROR_FATAL); /* up to 256 entries in word table */
+      ins(ws, LDR(R(r_pixel), R(r_table)) | INDEX(R(r_pixel), 2),   "LDR     r_pixel,[r_table, r_pixel, LSL #2] ; word table lookup");
+      /* >>> with 16bpp that could be in the top half? Not sure... */
+    }
+    pixl2bpp = ws->out_l2bpp;              /* we've finished */
+  }
+  else if (wp->is_it_jpeg && (wp->dither_truecolour & 2))
+  {
+    /* bottom n bits of word contains colour number we want... */
+    pixl2bpp = ws->out_l2bpp;              /* we've finished */
+    comment(ws, "JPEG error diffusion should have done all the work!");
+  }
+  else if (pixl2bpp == 4 && ws->out_l2bpp < 4)
+  {
+    /* Hack for JPEG data in RISC OS 3
+     * r_pixel is a 16bpp colour value at the moment, but we have no lookup table for the 16->1/2/4/8 transition
+     * For 1/2/4bpp we use the top bits of red as the grey level. From a JPEG source this will work
+     * fine, as the JPEG will have noticed that the output is mono and simply produced greyscale
+     * output.
+     */
+    comment(ws, "Colour truncation without lookup table.\n");
+    if (ws->out_l2bpp == 0) /* 1bpp */
+    {
+      comment(ws, "Creating 0 or 1 from 0bbbbbgg gggrrrrr");
+      TST(R(r_pixel), IMM(16),                                      "TST     r_pixel,#16                     ; test hi bit of R");
+      MOV(R(r_pixel), IMM(1),                                       "MOV     r_pixel,#1                      ; black");
+      MOV(R(r_pixel), NE | IMM(0),                                  "MOVNE   r_pixel,#0                      ; white");
+      pixl2bpp = 0;
+    }
+    else if (ws->out_l2bpp == 1) /* 2bpp */
+    {
+      comment(ws, "Creating 0,1,2 or 3 from 0bbbbbgg gggrrrrr");
+      MOV(R(r_pixel), OP2R(R(r_pixel)) | LSRI(3),                   "MOV     r_pixel,r_pixel,LSR #3           ; hi 2 bits of R");
+      AND(R(r_pixel), R(r_pixel), IMM(3),                           "AND     r_pixel,r_pixel,#3               ; mask off everything else");
+      RSB(R(r_pixel), R(r_pixel), IMM(3),                           "RSB     r_pixel,r_pixel,#3               ; change to 0->white, 3->black");
+      pixl2bpp = 1;
+    }
+    /* Not chained with 'else' to the tests above; the depth tests are mutually exclusive anyway. */
+    if (ws->out_l2bpp == 2) /* 4bpp */
+    {
+      comment(ws, "Creating wimp colour in 0..7 from 0bbbbbgg gggrrrrr");
+      MOV(R(r_pixel), OP2R(R(r_pixel)) | LSRI(2),                   "MOV     r_pixel,r_pixel,LSR #2           ; hi 3 bits of R");
+      AND(R(r_pixel), R(r_pixel), IMM(7),                           "AND     r_pixel,r_pixel,#7               ; mask off everything else");
+      RSB(R(r_pixel), R(r_pixel), IMM(7),                           "RSB     r_pixel,r_pixel,#7               ; change to 0->white, 7->black");
+      pixl2bpp = 2;
+    }
+    else if (ws->out_l2bpp == 3) /* 8bpp from 16bpp true colour, no lookup table - only for use on RISC OS 3.10 */
+    {
+      /* Get the top two bits of each gun. The organisation is:
+       * bit 0 - tint 0
+       * bit 1 - tint 1
+       * bit 2 - red 2
+       * bit 3 - blue 2
+       * bit 4 - red 3 (high)
+       * bit 5 - green 2
+       * bit 6 - green 3 (high)
+       * bit 7 - blue 3 (high)
+       */
+      comment(ws, "Creating bggrbrtt from 0bbbbbgg gggrrrrr");
+      /* Making the tint - the average of the lo 3 bits of RGB isn't a bad approximation. We make this
+       * by adding them all up, multiplying by 5, and dividing by 32 (the LSR #5 in the final ORR below),
+       * which gives a value in 0..3. We involve the lo bits in the approximation
+       * as well, in case they produce a useful carry.
+       */
+      AND(R(r_temp1), R(r_pixel), IMM(0x1C) | IMMROR(24),           "AND     r_temp1,r_pixel,#&1C00           ; bottom 3 bits of B");
+      MOV(R(r_temp2), OP2R(R(r_temp1)) | LSRI(10),                  "MOV     r_temp2,r_temp1,LSR #10          ; at bottom of temp2");
+      AND(R(r_temp1), R(r_pixel), IMM(0xE0),                        "AND     r_temp1,r_pixel,#&E0             ; bottom 3 bits of G");
+      ADD(R(r_temp2), R(r_temp2), OP2R(R(r_temp1)) | LSRI(5),       "ADD     r_temp2,r_temp2,r_temp1,LSR #5   ; add to bottom B bits");
+      AND(R(r_temp1), R(r_pixel), IMM(0x07),                        "AND     r_temp1,r_pixel,#&07             ; bottom 3 bits of R");
+      ADD(R(r_temp2), R(r_temp2), OP2R(R(r_temp1)),                 "ADD     r_temp2,r_temp2,r_temp1          ; add to bottom B+G bits");
+      ADD(R(r_temp2), R(r_temp2), OP2R(R(r_temp2)) | LSLI(2),       "ADD     r_temp2,r_temp2,r_temp2,LSL #2   ; (lo R+G+B)*5    (< 128)");
+      /* The hi bits are just done by extracting from the 16bpp value. This takes ages! */
+      MOV(R(r_temp1), IMM(0),                                       "MOV     r_temp1,#0                       ; building result pixel for hi bits");
+      /* Top bits of B */
+      TST(R(r_pixel), IMM(64) | IMMROR(24),                         "TST     r_pixel,#&4000                   ; test top bit of B");
+      ORR(R(r_temp1), R(r_temp1), NE | IMM(128),                    "ORRNE   r_temp1,r_temp1,#128             ; bit 7 = top bit of B");
+      TST(R(r_pixel), IMM(32) | IMMROR(24),                         "TST     r_pixel,#&2000                   ; test next bit of B");
+      ORR(R(r_temp1), R(r_temp1), NE | IMM(8),                      "ORRNE   r_temp1,r_temp1,#8               ; bit 3 = next bit of B");
+      /* Top bits of G */
+      TST(R(r_pixel), IMM(2) | IMMROR(24),                          "TST     r_pixel,#&200                    ; test top bit of G");
+      ORR(R(r_temp1), R(r_temp1), NE | IMM(64),                     "ORRNE   r_temp1,r_temp1,#64              ; bit 6 = top bit of G");
+      TST(R(r_pixel), IMM(1) | IMMROR(24),                          "TST     r_pixel,#&100                    ; test next bit of G");
+      ORR(R(r_temp1), R(r_temp1), NE | IMM(32),                     "ORRNE   r_temp1,r_temp1,#32              ; bit 5 = next bit of G");
+      /* Top bits of R */
+      TST(R(r_pixel), IMM(16),                                      "TST     r_pixel,#&10                     ; test top bit of R");
+      ORR(R(r_temp1), R(r_temp1), NE | IMM(16),                     "ORRNE   r_temp1,r_temp1,#16              ; bit 4 = top bit of R");
+      TST(R(r_pixel), IMM(8),                                       "TST     r_pixel,#&08                     ; test next bit of R");
+      ORR(R(r_temp1), R(r_temp1), NE | IMM(4),                      "ORRNE   r_temp1,r_temp1,#4               ; bit 2 = next bit of R");
+      ORR(R(r_pixel), R(r_temp1), OP2R(R(r_temp2)) | LSRI(5),       "ORR     r_pixel,r_temp1,r_temp2,LSR #5   ; combine hi bits and tint");
+      pixl2bpp = 3;
+    }
+  }
+  assert(pixl2bpp == ws->out_l2bpp, ERROR_FATAL); /* If this hasn't happened, we haven't completed the transformation. */
+  if (((ws->gcol & 7) == 2) && (pixl2bpp != 5)) /* AND plot action which did something stupid for 32bpp (GPS)*/
+  {
+    MOV(R(r_pixel), OP2R(R(r_pixel)) | LSLI(31-(wp->BPP)), "MOV     r_pixel, r_pixel, LSL 31-out_bpp");
+    ORR(R(r_pixel), R(r_pixel), IMM(2) | IMMROR(2),         "ORR     r_pixel,r_pixel,#&80000000 ");
+    MOV(R(r_pixel), OP2R(R(r_pixel)) | ASRI(31-(wp->BPP)),  "MOV     r_pixel, r_pixel, ASR 31-out_bpp");
+  }
+  comment(ws, "r_pixel is now a destination pixel.");
+    ORR(R(r_pixel), R(r_pixel), OP2R(R(r_pixel)) | LSLI(wp->BPP),   "ORR     r_pixel,r_pixel,r_pixel,LSL #out_bpp:SHR:1 ; double pixel output"); /* NOTE(review): the condition guarding this double-pixel step is not visible in this copy - confirm */
+  newline();
+static void save_pixel(asm_workspace *wp, workspace *ws)
+/* Save the new pixel into the current destination pixel.
+ *
+ * Three shapes of code are generated:
+ *  - PLOTMASK: the destination is ORed with the ECF word in r_inword and
+ *    then EORed with the one in r_bgcolour (directly in memory for 32-bit
+ *    destinations, otherwise in r_outword using only the pixel's bits).
+ *  - 32-bit destination: a plain STR for gcol 0, otherwise load the old
+ *    pixel into r_temp1, combine, and store it back.
+ *  - narrower destinations: the GCOL action is applied to r_outword.
+ * For actions 6 and 7 r_pixel is inverted before use and inverted back
+ * afterwards, so r_pixel is left as it was found.
+ */
+/* Recall GCOL actions:
+ * 0 -> overwrite old pixel
+ * 1 -> OR with old pixel
+ * 2 -> AND with old pixel
+ * 3 -> EOR with old pixel
+ * 4 -> invert old pixel
+ * 5 -> do nothing
+ * 6 -> AND old pixel with NOT of new pixel
+ * 7 -> OR old pixel with NOT of new pixel
+ */
+  comment(ws, "Put the pixel in the output stream.");
+  if (PLOTMASK)
+  {
+    if (DEST_32_BIT)
+    {
+      ins(ws, LDR(R(r_pixel), R(r_outptr)) | OFFSET(0),              "LDR     r_pixel,[r_outptr] ;bkah");
+      ORR(R(r_pixel), R(r_inword), OP2R(R(r_pixel)),                 "ORR     r_pixel,r_inword,r_pixel               ; 1OR gcol action");
+      EOR(R(r_pixel), R(r_bgcolour), OP2R(R(r_pixel)),               "EOR     r_pixel,r_bgcolour,r_pixel            ; 1EOR gcol action");
+      ins(ws, STR(R(r_pixel), R(r_outptr)) | OFFSET(0),              "STR     r_pixel,[r_outptr]                    ;blaq5h");
+    }
+    else
+    {
+      if (DEST_16_BIT)
+      {
+        /* Each LSL #16 / LSR #16 pair isolates the low 16 bits of the ECF word */
+        MOV(R(r_pixel), OP2R(R(r_inword)) | LSLI(16),             "MOV     r_pixel,r_inword,LSL #16        ; fetch 16 bit ECF pattern pixel44 99");
+        MOV(R(r_pixel), OP2R(R(r_pixel)) | LSRI(16),              "MOV     r_pixel,r_pixel,LSR #16         ; 4444444");
+        ORR(R(r_outword), R(r_outword), OP2R(R(r_pixel)),         "ORR     r_outword,r_outword,r_pixel           ; ECF OR mask44 99");
+        MOV(R(r_pixel), OP2R(R(r_bgcolour)) | LSLI(16),           "MOV     r_pixel,r_bgcolour,LSL #16        ; fetch 16 bit ECF pattern pixel 4499");
+        MOV(R(r_pixel), OP2R(R(r_pixel)) | LSRI(16),              "MOV     r_pixel,r_pixel,LSR #16           ;449");
+        EOR(R(r_outword), R(r_outword), OP2R(R(r_pixel)),         "EOR     r_outword,r_outword,r_pixel           ; ECF EOR mask 4499");
+      }
+      else
+      {
+        AND(R(r_pixel), R(r_inword), IMM(ws->out_pixmask),       "1AND     r_pixel,r_inword,#out_pixmask  ; blah blah");
+        ORR(R(r_outword), R(r_outword), OP2R(R(r_pixel)),          "ORR     r_outword,r_outword,r_pixel           ; ECF OR mask");
+        AND(R(r_pixel), R(r_bgcolour), IMM(ws->out_pixmask),       "1AND     r_pixel,r_bgcolour,#out_pixmask    jthjg");
+        EOR(R(r_outword), R(r_outword), OP2R(R(r_pixel)),       "EOR     r_outword,r_outword,r_pixel           ; ECF EOR mask");
+      }
+    }
+  }
+  else
+  {
+    if (DEST_32_BIT)
+    {
+      if (ws->gcol != 0) /* Not just a simple store operation */
+      {
+        ins(ws, LDR(R(r_temp1), R(r_outptr)) | OFFSET(0),             "LDR     r_temp1,[r_outptr]");
+        switch(ws->gcol)
+        {
+          case 7: MVN(R(r_pixel), OP2R(R(r_pixel)),                     "MVN     r_pixel,r_pixel                       ; OR with neg action"); /* falls through to the OR */
+          case 1: ORR(R(r_temp1), R(r_pixel), OP2R(R(r_temp1)),         "ORR     r_temp1,r_pixel,r_temp1               ; OR gcol action"); break;
+          case 6: MVN(R(r_pixel), OP2R(R(r_pixel)),                     "MVN     r_pixel,r_pixel                       ; AND with neg action"); /* falls through to the AND */
+          case 2: AND(R(r_temp1), R(r_pixel), OP2R(R(r_temp1)),         "AND     r_temp1,r_pixel,r_temp1               ; AND gcol action"); break;
+          case 3: EOR(R(r_temp1), R(r_pixel), OP2R(R(r_temp1)),         "EOR     r_temp1,r_pixel,r_temp1               ; EOR gcol action"); break;
+          case 4: MVN(R(r_temp1), OP2R(R(r_temp1)),                     "MVN     r_temp1,r_temp1                       ; neg gcol action"); break;
+          /* case 5: is a NOP */
+        }
+      /* Still inside the gcol != 0 branch: write the combined pixel back */
+      ins(ws, STR(R(r_temp1), R(r_outptr)) | OFFSET(0),               "STR     r_temp1,[r_outptr]");
+      if ((ws->gcol == 7) || (ws->gcol == 6)) /* put r_pixel back as we found it */
+        MVN(R(r_pixel), OP2R(R(r_pixel)),                             "1MVN     r_pixel,r_pixel                       ; Put r_pixel back");
+      }
+      else
+      {
+        ins(ws, STR(R(r_pixel), R(r_outptr)) | OFFSET(0),             "STR     r_pixel,[r_outptr]");
+      }
+    }
+    else
+    {
+      if (ws->gcol == 6 || ws->gcol == 7) /* and/or with NOT of incoming pixel */
+      {
+        if (DESTD_16_BIT)
+        {
+          /* 0xffff is applied as two 8-bit immediates: 0x00ff then 0xff00 */
+          EOR(R(r_pixel), R(r_pixel), IMM(255),                       "1EOR     r_pixel,r_pixel,#0x00ff               ; act with NOT of input pixel");
+          EOR(R(r_pixel), R(r_pixel), IMM(255) | IMMROR(24),          "1EOR     r_pixel,r_pixel,#0xff00");
+        }
+        else
+          EOR(R(r_pixel), R(r_pixel), IMM(ws->out_dpixmask),          "1EOR     r_pixel,r_pixel,#out_dpixmask         ; act with NOT of input pixel");
+      }
+      switch (ws->gcol)
+      {
+        case 0:
+          if (SOURCE_MASK) /* if no mask, the pixels are clear already */
+          {
+            if (DESTD_16_BIT)
+            {
+              BIC(R(r_outword), R(r_outword), IMM(255),                 "BIC     r_outword,r_outword,#0x00ff");
+              BIC(R(r_outword), R(r_outword), IMM(255) | IMMROR(24),    "BIC     r_outword,r_outword,#0xff00");
+            }
+            else
+              BIC(R(r_outword), R(r_outword), IMM(ws->out_dpixmask),    "BIC     r_outword,r_outword,#out_dpixmask");
+          }
+          /* fall through */
+        case 7:
+        case 1: ORR(R(r_outword), R(r_outword), OP2R(R(r_pixel)),       "ORR     r_outword,r_outword,r_pixel           ; gcol action"); break;
+        case 6:
+        case 2: AND(R(r_outword), R(r_outword), OP2R(R(r_pixel)),       "AND     r_outword,r_outword,r_pixel           ; AND gcol action"); break;
+        case 3: EOR(R(r_outword), R(r_outword), OP2R(R(r_pixel)),       "EOR     r_outword,r_outword,r_pixel           ; EOR gcol action"); break;
+        case 4: if (DESTD_16_BIT)
+                {
+                  EOR(R(r_outword), R(r_outword), IMM(255),             "EOR     r_outword,r_outword,#0x00ff           ; negate existing pixel");
+                  EOR(R(r_outword), R(r_outword), IMM(255) | IMMROR(24),"EOR     r_outword,r_outword,#0xff00");
+                }
+                else
+                  EOR(R(r_outword), R(r_outword), IMM(ws->out_dpixmask),"EOR     r_outword,r_outword,#out_dpixmask     ; negate existing pixel");
+                break;
+        case 5: comment(ws, "no GCOL action"); break;
+      }
+      if (ws->gcol == 6 || ws->gcol == 7) /* put r_pixel back as we found it in case scaling > 1:1! */
+      {
+        if (DESTD_16_BIT)
+        {
+          EOR(R(r_pixel), R(r_pixel), IMM(255),                       "EOR     r_pixel,r_pixel,#0x00ff               ; put r_pixel back as it was");
+          EOR(R(r_pixel), R(r_pixel), IMM(255) | IMMROR(24),          "EOR     r_pixel,r_pixel,#0xff00               ; put r_pixel back as it was");
+        }
+        else
+          EOR(R(r_pixel), R(r_pixel), IMM(ws->out_dpixmask),          "EOR     r_pixel,r_pixel,#out_dpixmask         ;  put r_pixel back as it was");
+      }
+    }
+  }
+static void save_pixel_opt(asm_workspace *wp, workspace *ws)
+/* Save pixel for use by optimised >5 scaling code.
+ *
+ * Always behaves like GCOL action 0 on r_outword: the destination pixel's
+ * bits are cleared with BIC (0xffff as two 8-bit immediates when
+ * DESTD_16_BIT, otherwise out_dpixmask) and r_pixel is then ORed in.
+ * ws->gcol is not consulted here.
+ */
+  if (DESTD_16_BIT)
+  {
+    BIC(R(r_outword), R(r_outword), IMM(255),                 "3BIC     r_outword,r_outword,#0x00ff");
+    BIC(R(r_outword), R(r_outword), IMM(255) | IMMROR(24),    "4BIC     r_outword,r_outword,#0xff00");
+  }
+  else
+  {
+    BIC(R(r_outword), R(r_outword), IMM(ws->out_dpixmask),    "5BIC     r_outword,r_outword,#out_dpixmask");
+  }
+    ORR(R(r_outword), R(r_outword), OP2R(R(r_pixel)),       "6ORR     r_outword,r_outword,r_pixel           ; gcol action");
+static void save_pixel2(asm_workspace *wp, workspace *ws)
+/* Save the new pixel into the pixel after the current destination pixel.
+ *
+ * Only GCOL action 0 is supported (asserted below). For 32-bit destinations
+ * the pixel is stored at [r_outptr,#4]. Otherwise it is ORed into r_outword
+ * shifted left by BPC; when the source has a mask, the target bits of
+ * r_outword are cleared first.
+ */
+  comment(ws, "Put the pixel in the output stream, one after the 'current' pixel.");
+  /* Current limitation */
+  assert(ws->gcol == 0, ERROR_FATAL);
+  if (DEST_32_BIT)
+  {
+    ins(ws, STR(R(r_pixel), R(r_outptr)) | OFFSET(4),         "STR     r_pixel,[r_outptr,#4]");
+  }
+  else
+  {
+    if (SOURCE_MASK)
+    {
+      if (wp->BPC == 16) /* DEST_16_BIT but includes double-pixel 256-colour mode 10 too */
+      {
+        /* Clear the top 16 bits as two 8-bit immediates: 0x00ff0000 then 0xff000000 */
+        BIC(R(r_outword), R(r_outword), IMM(255) | IMMROR(16),  "BIC     r_outword,r_outword,#0x00ff0000");
+        BIC(R(r_outword), R(r_outword), IMM(255) | IMMROR(8),   "BIC     r_outword,r_outword,#0xff000000");
+      }
+      else
+        /* A rotate of 32-BPC is a left shift by BPC; BPC == 1 would need an odd rotate, so 2 is used directly */
+        BIC(R(r_outword), R(r_outword),
+            wp->BPC == 1
+              ? IMM(2) /* IMMROR arg must be an even number */
+              : IMM(ws->out_dpixmask) | IMMROR(32 - wp->BPC),   "BIC     r_outword,r_outword,#out_dpixmask:SHL:out_bpc");
+    }
+    ORR(R(r_outword),R(r_outword),
+        OP2R(R(r_pixel)) | LSLI(wp->BPC),                       "ORR     r_outword,r_outword,r_pixel,LSL #out_bpc");
+  }
+*                                                                         *
+*    Bitblit: Advancing the current pixel.                                *
+*                                                                         *
+static void fetch_pixel_inc(asm_workspace *wp, workspace *ws)
+/* Increment the pointer to the source pixel.
+ *
+ * When not PLOTMASK: 32-bit sources advance r_inptr by 4; narrower sources
+ * rotate r_inword right by in_bpc, subtract in_bpc << 27 (encoded as
+ * in_bpc*2 rotated right by 6) from r_inshift with the flags set, and load
+ * the next word with writeback when the result is zero.
+ * The mask is advanced either in step with the pixels (rotating
+ * r_maskinword by in_bpc, reloading from [r_inptr,r_masko] on the same EQ
+ * condition) or, for a separate mask, through r_maskinptr and r_maskinshift
+ * using mask_bpp.
+ * NOTE(review): several guarding conditions (presumably SOURCE_MASK and
+ * SOURCE_BPPMASK tests) are not visible in this copy of the source, so the
+ * exact nesting of the mask code below must be confirmed against the
+ * complete file.
+ */
+  comment(ws, "Advance source pointer");
+  if (!PLOTMASK) /* The ECF pattern remains aligned to the destination */
+  {
+    if (SOURCE_32_BIT)
+    {
+      ADD(R(r_inptr), R(r_inptr), IMM(4),                      "ADD     r_inptr,r_inptr,#4");
+    }
+    else
+    {
+      MOV(R(r_inword), OP2R(R(r_inword)) | RORI(ws->in_bpc),   "MOV     r_inword,r_inword,ROR #in_bpc");
+        MOV(R(r_maskinword), OP2R(R(r_maskinword)) |
+                             RORI(ws->in_bpc),                 "MOV     r_maskinword,r_maskinword,ROR #in_bpc");
+      SUB(R(r_inshift), R(r_inshift),
+          S | IMM(ws->in_bpc*2) | IMMROR(6),                   "SUBS    r_inshift,r_inshift,#in_bpc:SHL:27 ; auto-resets itself to 0");
+      ins(ws, LDR(R(r_inword), R(r_inptr))
+            | EQ | WRITEBACK | OFFSET(4),                      "LDREQ   r_inword,[r_inptr,#4]!");
+    }
+  }
+  {
+    {
+      MOV(R(r_maskinword), OP2R(R(r_maskinword))
+                         | RORI(ws->mask_bpp),               "MOV     r_maskinword,r_maskinword,ROR #mask_bpp");
+      SUB(R(r_maskinshift),R(r_maskinshift),
+                         S | IMM(ws->mask_bpp*2) | IMMROR(6),"SUBS    r_maskinshift,r_maskinshift,#mask_bpp:SHL:27 ; auto-resets itself to 0");
+      ins(ws, LDR(R(r_maskinword), R(r_maskinptr))
+            | EQ | WRITEBACK | OFFSET(4),                    "LDREQ   r_maskinword,[r_maskinptr,#4]!");
+    }
+    else
+    {
+      assert(!SOURCE_32_BIT, ERROR_FATAL);
+      ins(ws, LDR(R(r_maskinword),
+              R(r_inptr)) | EQ | INDEX(R(r_masko), 0),       "LDREQ   r_maskinword,[r_inptr,r_masko]");
+    }
+  }
+static void fetch_pixel_inc2(asm_workspace *wp, workspace *ws)
+/* Increment the pointer to the source pixel by two - only used in the 2-at-a-time
+ * optimised loop.
+ *
+ * Three source cases are generated:
+ *   - SOURCE_32_BIT:  r_inptr is advanced by 8 bytes.
+ *   - SOURCED_16_BIT: the next word is always loaded, with writeback.
+ *   - otherwise:      r_inword is rotated right by in_bpc*2, r_inshift is
+ *                     decremented with the flags set, and the next word is
+ *                     loaded only if that result was zero (LDREQ).
+ * The instructions after that step r_maskinword on, reloading it either
+ * through r_maskinptr or from [r_inptr, r_masko].
+ * NOTE(review): the mask-advance sequence appears twice below, the first
+ * copy following an '#if 0' line and carrying "x" as its debug text -
+ * confirm which copies are actually compiled.
+ */
+  comment(ws, "Advance source pointer by two pixels");
+  if (SOURCE_32_BIT)
+  {
+    ADD(R(r_inptr), R(r_inptr), IMM(8),                      "ADD     r_inptr,r_inptr,#8                ; past 2 32-bit pixels");
+  }
+  else if (SOURCED_16_BIT)
+  {
+    /* Two pixels per word - assured of loading a new word */
+    ins(ws, LDR(R(r_inword), R(r_inptr))
+          | WRITEBACK | OFFSET(4),                           "LDR     r_inword,[r_inptr,#4]!             ; past 2 16-bit pixels");
+  }
+  else
+  {
+    MOV(R(r_inword), OP2R(R(r_inword)) | RORI(ws->in_bpc*2), "MOV     r_inword,r_inword,ROR #in_bpc*2");
+      MOV(R(r_maskinword), OP2R(R(r_maskinword)) |
+                           RORI(ws->in_bpc*2),               "MOV     r_maskinword,r_maskinword,ROR #in_bpc*2");
+    SUB(R(r_inshift), R(r_inshift),
+        S | IMM(ws->in_bpc) | IMMROR(4),                     "SUBS    r_inshift,r_inshift,#in_bpc:SHL:28 ; auto-resets itself to 0");
+    ins(ws, LDR(R(r_inword), R(r_inptr))
+          | EQ | WRITEBACK | OFFSET(4),                      "LDREQ   r_inword,[r_inptr,#4]!             ; load more input pixels (inc2)");
+  }
+  {
+    {
+#if 0
+      MOV(R(r_maskinword), OP2R(R(r_maskinword))
+                         | RORI(ws->mask_bpp),                 "x"/*MOV     r_maskinword,r_maskinword,ROR #mask_bpp"*/);
+      SUB(R(r_maskinshift),R(r_maskinshift),
+                         S | IMM(ws->mask_bpp*2) | IMMROR(6),  "x"/*SUBS    r_maskinshift,r_maskinshift,#mask_bpp:SHL:27"*/);
+      ins(ws, LDR(R(r_maskinword), R(r_maskinptr))
+            | EQ | WRITEBACK | OFFSET(4),                     "x"/* "LDREQ   r_maskinword,[r_maskinptr,#4]!     ; load more mask pixels (inc2)"*/);
+      MOV(R(r_maskinword), OP2R(R(r_maskinword))
+                         | RORI(ws->mask_bpp),               "MOV     r_maskinword,r_maskinword,ROR #mask_bpp");
+      SUB(R(r_maskinshift),R(r_maskinshift),
+                         S | IMM(ws->mask_bpp*2) | IMMROR(6),"SUBS    r_maskinshift,r_maskinshift,#mask_bpp:SHL:27");
+      ins(ws, LDR(R(r_maskinword), R(r_maskinptr))
+            | EQ | WRITEBACK | OFFSET(4),                    "LDREQ   r_maskinword,[r_maskinptr,#4]!     ; load more mask pixels (inc2)");
+    }
+    else
+    {
+      assert(!SOURCE_32_BIT, ERROR_FATAL); /* this form of mask reload is not valid for a 32-bit source */
+      ins(ws, LDR(R(r_maskinword), R(r_inptr))
+              | EQ | INDEX(R(r_masko), 0),                   "LDREQ   r_maskinword,[r_inptr,r_masko]      ; load more mask pixels (inc2)");
+    }
+  }
+static void odither_inc(asm_workspace *wp, workspace *ws, int xy)
+/* Call every output pixel - alternates the ordered dither addition value
+ * xy == 0 for x, 1 for y
+ *
+ * Nothing is emitted when ws->odither is zero. Otherwise a single EOR on
+ * r_oditheradd is emitted, toggling the bit given by
+ * 1 << (ws->odither - xy) under the IMMROR(8) immediate rotation; the y
+ * form is described in the debug text as the x value shifted right by one.
+ * wp is not used.
+ */
+  if (ws->odither)
+    EOR(R(r_oditheradd),R(r_oditheradd), IMM(1 << (ws->odither - xy)) | IMMROR(8),
+      xy == 0 ? "EOR     r_oditheradd,r_oditheradd,#odither_eorvalue ; alternate dither offset"
+              : "EOR     r_oditheradd,r_oditheradd,#odither_eorvalue:SHR:1 ; alternate dither offset");
+  UNUSED(wp);
+#if 1
+static void skip_current_output_words(asm_workspace *wp, workspace *ws)
+/* Skip over masked out words. r_xcount = output pixels to skip
+ *                             r_temp1   = pixels left in current word.
+ *
+ * 32-bit destination: r_outptr is moved on by r_xcount words and r_xcount
+ * is set to zero.
+ * Other depths: the pixels left in the current word are taken off
+ * r_xcount, r_outword is rotated by that many pixels (r_temp1 scaled by
+ * out_l2bpc) and stored, and r_outshift is reset to 0. The number of whole
+ * words still to skip (r_xcount >> out_l2ppw) is then added to r_outptr if
+ * non-zero, the word now under r_outptr is loaded into r_outword, and
+ * r_xcount is reduced to the pixels that remain to be skipped in that word.
+ */
+  comment(ws, "4Skipping masked words.");
+  if (DEST_32_BIT)
+  {
+    ADD(R(r_outptr), R(r_outptr), R(r_xcount) | LSLI(2),          "4~ADD     r_outptr,r_outptr,r_xcount,LSL #2        ; skip 4*pixels bytes");
+    MOV(R(r_xcount), IMM(0),                                      "41MOV     r_xcount,#0");
+  }
+  else
+  {
+    SUB(R(r_xcount), R(r_xcount), OP2R(R(r_temp1)),               "4~SUB     r_xcount, r_xcount, r_temp1");
+    MOV(R(r_temp1),  OP2R(R(r_temp1)) | LSLI(ws->out_l2bpc),      "4~MOV     r_temp1, t_temp1, LSL #out_log2bpc");
+    MOV(R(r_outword), OP2R(R(r_outword)) | RORR(R(r_temp1)),      "4~MOV     r_outword,r_outword,ROR r_temp1");
+    ins(ws, STR(R(r_outword), R(r_outptr)) | POSTINC(4),          "4~STR     r_outword,[r_outptr],#4");
+    MOV(R(r_outshift), IMM(0),                                    "4~MOV     r_outshift, #0");
+    MOV(R(r_temp1), OP2R(R(r_xcount)) | S |LSRI(ws->out_l2ppw),   "4~~MOVS     r_temp1,r_xcount,LSR #out_log2ppw            ; whole words to skip");
+    ADD(R(r_outptr), R(r_outptr), NE | R(r_temp1) | LSLI(2),      "4~ADDNE    r_outptr,r_outptr,r_temp1,LSL #2             ; skip 4*pixels bytes");
+    ins(ws, LDR(R(r_outword), R(r_outptr)) | OFFSET(0),           "4~~LDR     r_outword,[r_outptr]");
+    SUB(R(r_xcount), R(r_xcount),
+                      OP2R(R(r_temp1)) | LSLI(ws->out_l2ppw),     "4~SUB     r_xcount, r_xcount, r_temp1 LSL #out_log2ppw ; pixels left to skip");
+  }
+static void skip_some_pixels(asm_workspace *wp, workspace *ws)
+/* Adjust outword and outshift back to start.
+ *
+ * Emits code that converts r_xcount pixels into a bit count in r_temp1
+ * (shift left by out_l2bpc), rotates r_outword right by that amount,
+ * subtracts the same amount (moved up by 27 bits) from r_outshift, and
+ * finally zeroes r_xcount. wp is not used.
+ */
+    MOV(R(r_temp1),  OP2R(R(r_xcount)) | LSLI(ws->out_l2bpc),    "2~~MOV     r_temp1, r_xcount, LSL #out_log2bpc");
+    MOV(R(r_outword), OP2R(R(r_outword)) | RORR(R(r_temp1)),     "2~MOV     r_outword,r_outword,ROR r_temp1");
+    SUB(R(r_outshift), R(r_outshift),
+        OP2R(R(r_temp1)) | LSLI(27),                                   "2~~SUB    r_outshift,r_outshift,r_temp1,SHL #27");
+    MOV(R(r_xcount), IMM(0),                                      "31MOV     r_xcount,#0");
+    UNUSED(wp);
+static void save_pixel_inc(asm_workspace *wp, workspace *ws)
+/* Increment the pointer to the destination pixel.
+ *
+ * 32-bit destination: r_outptr is advanced by 4.
+ * Other depths: r_outword is rotated right by wp->BPC (as are r_inword and
+ * r_bgcolour when PLOTMASK is set), r_outshift is decremented with the
+ * flags set, and if that result was zero the finished word is stored with
+ * post-increment. The replacement r_outword is then either zero (gcol == 0
+ * with neither SOURCE_MASK nor PLOTMASK) or the word loaded from the new
+ * r_outptr.
+ * In every case odither_inc(wp, ws, 0) is called last.
+ */
+  comment(ws, "Advance destination pointer");
+  if (DEST_32_BIT)
+  {
+    ADD(R(r_outptr), R(r_outptr), IMM(4),                    "ADD     r_outptr,r_outptr,#4 323232");
+    if (PLOTMASK)
+    {
+#if 0
+      EOR(R(r_ecfindex), R(r_ecfindex), IMM(4),              "EOR     r_ecfindex,r_ecfindex,#4        ; either 0 or 4323232");
+/*      ins(ws, LDR(R(r_inword), R(r_inptr))
+            | INDEX(R(r_ecfindex), 0),                       "LDR     r_inword,[r_inptr,r_ecfindex]   ; load next word of ECF  32323");
+      ADD(R(r_ecfindex), R(r_ecfindex),
+            IMM(4),                                          "ADD     r_ecfindex,r_ecfindex,#4 132323");
+      ins(ws, LDR(R(r_bgcolour), R(r_inptr))
+            | INDEX(R(r_ecfindex), 0),                       "LDR     r_bgcolour,[r_inptr,r_ecfindex]   ; load next EOR word of ECF123232");
+      SUB(R(r_ecfindex), R(r_ecfindex),
+            IMM(4),                                          "SUB     r_ecfindex,r_ecfindex,#4 132323");
+*/    }
+  }
+  else
+  {
+    MOV(R(r_outword), OP2R(R(r_outword)) | RORI(wp->BPC),    "MOV     r_outword,r_outword,ROR #out_bpc    545454");
+    if (PLOTMASK)
+    {
+      MOV(R(r_inword), OP2R(R(r_inword)) | RORI(wp->BPC),    "MOV     r_inword,r_inword,ROR #out_bpc         ; advance ECF pattern    5");
+      MOV(R(r_bgcolour), OP2R(R(r_bgcolour)) | RORI(wp->BPC),    "MOV     r_bgcolour,r_bgcolour,ROR #out_bpc ; advance ECF eeyore pattern    5");
+    }
+    SUB(R(r_outshift), R(r_outshift),
+        S | IMM(wp->BPC*2) | IMMROR(6),                      "SUBS    r_outshift,r_outshift,#out_bpc:SHL:27        5");
+    ins(ws, STR(R(r_outword), R(r_outptr)) | EQ | POSTINC(4),"STREQ   r_outword,[r_outptr],#4        4");
+    if (ws->gcol == 0 && !SOURCE_MASK && !PLOTMASK)
+      MOV(R(r_outword), EQ | IMM(0),                         "MOVEQ   r_outword,#0                    ; setting pixels and no mask      4");
+    else
+      ins(ws, LDR(R(r_outword), R(r_outptr)) | EQ | OFFSET(0), "LDREQ   r_outword,[r_outptr]        4");
+    /* If entirely replacing pixels, no need to fetch the old ones.
+     * The last word has to be patched up carefully, see x_loop.
+     */
+    if (PLOTMASK)
+    {
+#if 0
+      EOR(R(r_ecfindex), R(r_ecfindex), EQ | IMM(4),         "EOREQ   r_ecfindex,r_ecfindex,#4        ; either 0 or 4");
+#if 0
+      ins(ws, LDR(R(r_inword), R(r_inptr))
+            | INDEX(R(r_ecfindex) | EQ, 0),                       "LDREQ   r_inword,[r_inptr,r_ecfindex]   ; load next word of ECF  1");
+      ADD(R(r_ecfindex), R(r_ecfindex),
+            IMM(4) | EQ,                                          "ADDEQ   r_ecfindex,r_ecfindex,#4  2");
+      ins(ws, LDR(R(r_bgcolour), R(r_inptr))
+            | INDEX(R(r_ecfindex) | EQ, 0),                       "LDREQ   r_bgcolour,[r_inptr,r_ecfindex]   ; load next EOR word of ECF2");
+      SUB(R(r_ecfindex), R(r_ecfindex),
+            IMM(4) | EQ,                                          "SUBEQ   r_ecfindex,r_ecfindex,#4 2");
+    }
+  }
+  odither_inc(wp, ws, 0);
+static void save_pixel_inc2(asm_workspace *wp, workspace *ws)
+/* Increment the pointer to the destination pixel by two. You are assured that
+ * a word fetch won't be necessary after the first of these. Only used in the
+ * optimised 2-at-a-time inner loop. You are assured that gcol==0.
+ *
+ * Three destination cases are generated:
+ *   - DEST_32_BIT:  r_outptr is advanced by 8 bytes.
+ *   - DESTD_16_BIT: the word is always stored with post-increment.
+ *   - otherwise:    r_outword is rotated right by BPC*2, r_outshift is
+ *                   decremented with the flags set, and the word is stored
+ *                   only if that result was zero (STREQ).
+ * After a store r_outword is reset to zero when there is no source mask,
+ * or reloaded from the new r_outptr when there is one.
+ */
+  comment(ws, "Advance destination pointer by two pixels");
+  if (DEST_32_BIT)
+    ADD(R(r_outptr), R(r_outptr), IMM(8),                    "ADD     r_outptr,r_outptr,#8");
+  else if (DESTD_16_BIT)
+  {
+    /* Two pixels per word - assured of saving a word and that gcol==0; the mask case is handled below */
+    ins(ws, STR(R(r_outword), R(r_outptr)) | POSTINC(4),     "STR     r_outword,[r_outptr],#4         ; store two pixels");
+    if (!SOURCE_MASK)
+      MOV(R(r_outword), IMM(0),                              "MOV     r_outword,#0                    ; setting pixels and no mask");
+    else
+      ins(ws, LDR(R(r_outword), R(r_outptr)) | OFFSET(0),    "LDR     r_outword,[r_outptr]            ; load dest data (in case of mask)");
+  }
+  else
+  {
+    MOV(R(r_outword), OP2R(R(r_outword)) | RORI(wp->BPC*2),  "MOV     r_outword,r_outword,ROR #out_bpc*2");
+    SUB(R(r_outshift), R(r_outshift),
+        S | IMM(wp->BPC) | IMMROR(4),                        "SUBS    r_outshift,r_outshift,#out_bpc:SHL:28");
+    ins(ws, STR(R(r_outword), R(r_outptr)) | EQ | POSTINC(4),"STREQ   r_outword,[r_outptr],#4         ; store pixels (inc2)");
+    if (!SOURCE_MASK)
+      MOV(R(r_outword), EQ | IMM(0),                         "MOVEQ   r_outword,#0                    ; setting pixels and no mask (inc2)");
+    else
+      ins(ws, LDR(R(r_outword), R(r_outptr)) | EQ | OFFSET(0), "LDREQ   r_outword,[r_outptr]            ; get dest data (in case of mask)");
+    /* If entirely replacing pixels, no need to fetch the old ones.
+     * The last word has to be patched up carefully, see x_loop.
+     */
+  }
+  odither_inc(wp, ws, 0); /* assume this has also been called once after the first pixel has been translated */
+static void plot_current_output_words(asm_workspace *wp, workspace *ws, int scale)
+/* plot multiple words of one pixel. r_xcount = output pixels to skip
+ *                                   r_temp1   = pixels left in current word.
+ *                                   r_pixel = pixel to output.
+ * NOTE(review): r_xcount is counted down as pixels are stored below, so it
+ * looks like the number of pixels still to plot rather than skip - confirm.
+ *
+ * 32-bit destination: one pixel is stored and r_xcount decremented. For
+ * scale < 21 a further scale-1 conditional store/decrement pairs are
+ * emitted inline. Otherwise a loop (plot_loop1a) stores ten pixels per
+ * pass while r_xcount is above 10, and ten conditional store/decrement
+ * pairs after plot_loop1b deal with what is left.
+ *
+ * Other depths: r_temp1 is taken off r_xcount, then a four-times unrolled
+ * loop (plot_loop1) calls save_pixel_opt and rotates r_outword, leaving
+ * the loop as soon as the SUBS on r_outshift gives zero. The completed
+ * word is stored. If r_xcount >> out_l2ppw is non-zero, r_pixel is
+ * replicated across the register by ORRing it with itself at doubling
+ * shifts starting from wp->BPP, that word is stored once per whole word
+ * (plot_loop2), and r_pixel is cut back down again (LSL/LSR by 16 for
+ * DESTD_16_BIT, otherwise AND with out_dpixmask). Finally the word at
+ * r_outptr is loaded into r_outword (plot_loop3).
+ */
+  int loop;
+  comment(ws, "2Optimised plotting of scaled sprite.");
+  if (DEST_32_BIT)
+  {
+#if 1
+    ins(ws, STR(R(r_pixel),  R(r_outptr)) | POSTINC(4),      "32STR     r_pixel,[r_outptr],#4");
+    SUB(R(r_xcount), R(r_xcount),
+        S | IMM(1),                                         "14SUBS    r_xcount,r_xcount,#1");
+    if (scale < 21)
+    {
+      for (loop = 1;loop<scale;loop++)
+      {
+        ins(ws, STR(R(r_pixel), R(r_outptr)) | NE | POSTINC(4),      "32STRNE   r_pixel,[r_outptr],#4");
+        SUB(R(r_xcount), R(r_xcount),
+              S | NE | IMM(1),                                    "14SUBNES    r_xcount,r_xcount,#1");
+      }
+    }
+    else
+    {
+      CMP(R(r_xcount), IMM(10),                                    "CMP     r_xcount, #10");
+      branch(ws, B | LE, L(plot_loop1b),                           "BLE     plot_loop1b");
+      DEFINE_LABEL(plot_loop1a, "loop for every ten pixels")
+      for (loop = 0;loop<10;loop++)
+      {
+        ins(ws, STR(R(r_pixel), R(r_outptr)) | POSTINC(4),         "32STR   r_pixel,[r_outptr],#4");
+      }
+      SUB(R(r_xcount), R(r_xcount),
+          IMM(10),                                                 "14SUB    r_xcount,r_xcount,#10");
+      CMP(R(r_xcount), IMM(10),                                    "CMP     r_xcount, #10");
+      branch(ws, B | GT, L(plot_loop1a),                           "BGT     plot_loop1a");
+      DEFINE_LABEL(plot_loop1b, "branch here when LH side obscured")
+      CMP(R(r_xcount), IMM(0),                                     "CMP     r_xcount, #0");
+      for (loop = 0;loop<10;loop++)
+      {
+        ins(ws, STR(R(r_pixel), R(r_outptr)) | NE | POSTINC(4),      "4STRNE   r_pixel,[r_outptr],#4");
+        SUB(R(r_xcount), R(r_xcount),
+              S | NE | IMM(1),                                    "16SUBNES    r_xcount,r_xcount,#1");
+      }
+    }
+    for (loop = 0;loop<scale;loop++)
+      ins(ws, STR(R(r_pixel), R(r_outptr)) | POSTINC(4),      "32STR   r_outword,[r_outptr],#4");
+  }
+  else
+  {
+    SUB(R(r_xcount), R(r_xcount), OP2R(R(r_temp1)),             "52SUB     r_xcount, r_xcount, r_temp1");
+    DEFINE_LABEL(plot_loop1, "1???")
+    save_pixel_opt(wp, ws);
+    MOV(R(r_outword), OP2R(R(r_outword)) | RORI(wp->BPC),       "015MOV     r_outword,r_outword,ROR #out_bpc");
+    SUB(R(r_outshift), R(r_outshift),
+        S | IMM(wp->BPC*2) | IMMROR(6),                         "7SUBS    r_outshift,r_outshift,#out_bpc:SHL:27");
+    branch(ws, B | EQ, L(plot_loop1a),                           "BEQ     plot_loop1a");
+    save_pixel_opt(wp, ws);
+    MOV(R(r_outword), OP2R(R(r_outword)) | RORI(wp->BPC),       "115MOV     r_outword,r_outword,ROR #out_bpc");
+    SUB(R(r_outshift), R(r_outshift),
+        S | IMM(wp->BPC*2) | IMMROR(6),                         "17SUBS    r_outshift,r_outshift,#out_bpc:SHL:27");
+    branch(ws, B | EQ, L(plot_loop1b),                           "BEQ     plot_loop1b");
+    save_pixel_opt(wp, ws);
+    MOV(R(r_outword), OP2R(R(r_outword)) | RORI(wp->BPC),       "215MOV     r_outword,r_outword,ROR #out_bpc");
+    SUB(R(r_outshift), R(r_outshift),
+        S | IMM(wp->BPC*2) | IMMROR(6),                         "27SUBS    r_outshift,r_outshift,#out_bpc:SHL:27");
+    branch(ws, B | EQ, L(plot_loop1c),                           "BEQ     plot_loop1c");
+    save_pixel_opt(wp, ws);
+    MOV(R(r_outword), OP2R(R(r_outword)) | RORI(wp->BPC),       "315MOV     r_outword,r_outword,ROR #out_bpc");
+    SUB(R(r_outshift), R(r_outshift),
+        S | IMM(wp->BPC*2) | IMMROR(6),                         "37SUBS    r_outshift,r_outshift,#out_bpc:SHL:27");
+    branch(ws, B | NE, L(plot_loop1),                           "8BNE     plot_loop1");
+    DEFINE_LABEL(plot_loop1a, "plot loop 1a - coz only one forward referance allowed")
+    DEFINE_LABEL(plot_loop1b, "plot loop 1b - coz only one forward referance allowed")
+    DEFINE_LABEL(plot_loop1c, "plot loop 1c - coz only one forward referance allowed")
+    ins(ws, STR(R(r_outword), R(r_outptr)) | POSTINC(4),          "9STR     r_outword,[r_outptr],#4");
+    MOV(R(r_temp1), OP2R(R(r_xcount)) | S |LSRI(ws->out_l2ppw),   "0MOVS    r_temp1,r_xcount,LSR #out_log2ppw            ; whole words to skip");
+    branch(ws, B | EQ, L(plot_loop3),                             "1BEQ     plot_loop3");
+    for (loop = wp->BPP;loop<32;loop*=2) /* each pass doubles the number of copies of the pixel held in r_pixel */
+      ORR(R(r_pixel), R(r_pixel), OP2R(R(r_pixel)) | LSLI(loop),  "2ORR     r_pixel,r_pixel,r_pixel, LSL #somenumber");
+    DEFINE_LABEL(plot_loop2, "2???")
+    ins(ws, STR(R(r_pixel), R(r_outptr)) | POSTINC(4),            "3STR     r_pixel,[r_outptr],#4");
+    SUB(R(r_xcount), R(r_xcount),
+        IMM(ws->out_ppw),                                         "4SUB     r_xcount,r_xcount,#out_ppw");
+    SUB(R(r_temp1), R(r_temp1),
+        S | IMM(1),                                               "5SUBS    r_temp1,r_temp1,#1");
+    branch(ws, B | NE, L(plot_loop2),                             "6BNE     plot_loop2");
+    if (DESTD_16_BIT)
+    {
+      MOV(R(r_pixel), OP2R(R(r_pixel)) | LSLI(16),                "7MOV     r_pixel, r_pixel, LSL #16            ; whole words to skip");
+      MOV(R(r_pixel), OP2R(R(r_pixel)) | LSRI(16),                "8MOV     r_pixel, r_pixel, LSR #16            ; whole words to skip");
+    }
+    else
+      AND(R(r_pixel), R(r_pixel), IMM(ws->out_dpixmask),          "9AND     r_pixel,r_pixel,#dpix_mask");
+    DEFINE_LABEL(plot_loop3, "3???")
+    ins(ws, LDR(R(r_outword), R(r_outptr)) | OFFSET(0),           "0LDR     r_outword,[r_outptr]");
+  }
+static void plot_some_pixels(asm_workspace *wp, workspace *ws)
+/* Non complete word pixel plot.
+ *
+ * Emits a four-times unrolled loop (plot_loop4). Each step calls
+ * save_pixel_opt, rotates r_outword right by wp->BPC, decrements
+ * r_outshift, and decrements r_xcount with the flags set; the loop is
+ * left as soon as r_xcount reaches zero. The three exit labels are all
+ * defined at the same point, one per forward branch.
+ */
+    DEFINE_LABEL(plot_loop4, "4???")
+    save_pixel_opt(wp, ws);
+    MOV(R(r_outword), OP2R(R(r_outword)) | RORI(wp->BPC),       "14MOV     r_outword,r_outword,ROR #out_bpc");
+    SUB(R(r_outshift), R(r_outshift),
+        S | IMM(wp->BPC*2) | IMMROR(6),                          "15SUBS    r_outshift,r_outshift,#out_bpc:SHL:27");
+    SUB(R(r_xcount), R(r_xcount),
+        S | IMM(1),                                             "16SUBS    r_xcount, r_xcount, #1");
+    branch(ws, B | EQ, L(plot_loop4a),                          "17BEQ     plot_loop4a");
+    save_pixel_opt(wp, ws);
+    MOV(R(r_outword), OP2R(R(r_outword)) | RORI(wp->BPC),       "214MOV     r_outword,r_outword,ROR #out_bpc");
+    SUB(R(r_outshift), R(r_outshift),
+        S | IMM(wp->BPC*2) | IMMROR(6),                          "215SUBS    r_outshift,r_outshift,#out_bpc:SHL:27");
+    SUB(R(r_xcount), R(r_xcount),
+        S | IMM(1),                                             "216SUBS    r_xcount, r_xcount, #1");
+    branch(ws, B | EQ, L(plot_loop4b),                          "17BEQ     plot_loop4b");
+    save_pixel_opt(wp, ws);
+    MOV(R(r_outword), OP2R(R(r_outword)) | RORI(wp->BPC),       "314MOV     r_outword,r_outword,ROR #out_bpc");
+    SUB(R(r_outshift), R(r_outshift),
+        S | IMM(wp->BPC*2) | IMMROR(6),                          "315SUBS    r_outshift,r_outshift,#out_bpc:SHL:27");
+    SUB(R(r_xcount), R(r_xcount),
+        S | IMM(1),                                             "316SUBS    r_xcount, r_xcount, #1");
+    branch(ws, B | EQ, L(plot_loop4c),                          "17BEQ     plot_loop4c");
+    save_pixel_opt(wp, ws);
+    MOV(R(r_outword), OP2R(R(r_outword)) | RORI(wp->BPC),       "414MOV     r_outword,r_outword,ROR #out_bpc");
+    SUB(R(r_outshift), R(r_outshift),
+        S | IMM(wp->BPC*2) | IMMROR(6),                          "415SUBS    r_outshift,r_outshift,#out_bpc:SHL:27");
+    SUB(R(r_xcount), R(r_xcount),
+        S | IMM(1),                                             "416SUBS    r_xcount, r_xcount, #1");
+    branch(ws, B | NE, L(plot_loop4),                           "17BNE     plot_loop4");
+    DEFINE_LABEL(plot_loop4a, "plot loop 4a - coz only one forward referance allowed")
+    DEFINE_LABEL(plot_loop4b, "plot loop 4b - coz only one forward referance allowed")
+    DEFINE_LABEL(plot_loop4c, "plot loop 4c - coz only one forward referance allowed")
+*                                                                         *
+*    Bitblit: Overall construction of the X loop.                         *
+*                                                                         *
+#define ADD_A(reg,value) arbitrary_add(ws, TRUE, FALSE, &ws->regnames.reg, value); /* reg += value, S bit not requested; needs 'ws' in scope */
+#define ADDS_A(reg,value) arbitrary_add(ws, TRUE, TRUE, &ws->regnames.reg, value); /* reg += value, S bit requested */
+#define SUB_A(reg,value) arbitrary_add(ws, FALSE, FALSE, &ws->regnames.reg, value); /* reg -= value, S bit not requested */
+#define SUBS_A(reg,value) arbitrary_add(ws, FALSE, TRUE, &ws->regnames.reg, value); /* reg -= value, S bit requested */
+static void arbitrary_add(workspace *ws, BOOL add, BOOL s, regname *r, int value)
+/* Add/subtract an arbitrary constant to a register - could be more than 8 bits.
+ *
+ *   ws    - workspace handed on to the instruction emitters
+ *   add   - TRUE to add, FALSE to subtract (inverted if value is negative)
+ *   s     - TRUE if the S bit is wanted on the result
+ *   r     - register to update; r->regno is used as both source and
+ *           destination, r->name only for the debug text
+ *   value - the constant to apply
+ *
+ * A zero constant emits only a CMP against #0, and only when s is set.
+ * Otherwise the constant is consumed eight bits at a time from the least
+ * significant end, one ADD/SUB immediate instruction per chunk, and the
+ * S bit is placed only on the last instruction of the sequence.
+ */
+  IFDEBUG(char a[256];)
+  if (value < 0) {value = -value; add = !add;} /* work with the magnitude and flip the operation */
+  if (value == 0) /* special case with 0 constant */
+  {
+    if (s)
+    {
+      IFDEBUG(do_sprintf(a, "CMP     %s,#0", r->name);)
+      CMP(r->regno, IMM(0), a);
+    }
+    /* else, nothing */
+  }
+  else
+  {
+    int opcode = add ? ADD_OPCODE : SUB_OPCODE;
+    int sopcode = s ? S : 0;
+    int shift_it = 0; /* number of bit positions removed from value so far; always even */
+    while (value != 0)
+    {
+      BOOL last;
+      int valuebyte;
+      if (value > 255) /* too big for one chunk: drop trailing zero bit-pairs first */
+        while ((value & 3) == 0) {value >>= 2; shift_it += 2;}
+      valuebyte = value & 0xff;
+      value &= 0xffffff00; /* the low eight bits are dealt with by this instruction */
+      last = value == 0; /* the last instruction needed */
+      IFDEBUG(
+        do_sprintf(a,
+          (last && sopcode ? "%sS%t8.%s,%s,#&%x" : "%s%t8.%s,%s,#&%x") _
+          (add ? "ADD" : "SUB") _ r->name _ r->name _ valuebyte << shift_it);)
+      ins(ws, opcode | (last ? sopcode : 0)
+            | DSTR(r->regno) | OP1R(r->regno)
+            | IMM(valuebyte) | IMMROR ((32 - shift_it) & 0x1e),
+            a);
+    }
+  }
+static void init_word_registers(asm_workspace *wp, workspace *ws)
+/* Initialise inword, outword, maskinword from their respective pointers
+ * and shift values.
+ *
+ * Nothing is emitted for r_inword when the source is 32-bit, nor for
+ * r_outword when the destination is 32-bit. In the general case each word
+ * is loaded and then rotated right by 32 - (shift >> 27), using r_pixel as
+ * a scratch register, so that the current pixel ends up in the least
+ * significant bit[s].
+ * When gcol is 0 with neither SOURCE_MASK nor PLOTMASK, r_outword is
+ * instead shifted left by (r_outshift >> 27), or set to zero if that
+ * amount is 0.
+ * With PLOTMASK, r_inword and r_bgcolour are loaded from the ECF pattern
+ * at [r_inptr, r_ecfindex] and at the following word, and are given the
+ * same rotation as r_outword; r_ecfindex is left unchanged overall.
+ */
+  comment(ws, "Load initial values of word registers");
+  /* Set up inword */
+  if (!PLOTMASK) /* PLOTMASK case handled below, because helped by setting up r_outword */
+  {
+    if (!SOURCE_32_BIT)
+    {
+      ins(ws, LDR(R(r_inword), R(r_inptr)) | OFFSET(0),         "LDR     r_inword,[r_inptr]              ; fetch first input pixels");
+      MOV(R(r_pixel), OP2R(R(r_inshift)) | LSRI(27),            "MOV     r_pixel,r_inshift,LSR #27       ; get real shift distance");
+      RSB(R(r_pixel), R(r_pixel), IMM(32),                      "RSB     r_pixel,r_pixel,#32             ; temporary use of r_pixel");
+      MOV(R(r_inword), OP2R(R(r_inword)) | RORR(R(r_pixel)),    "MOV     r_inword,r_inword,ROR r_pixel   "
+                                                              "; current input pixel now in least sig bit[s]");
+    }
+  }
+  if (SOURCE_MASK) /* Set up maskinword */
+  {
+    {
+      ins(ws, LDR(R(r_maskinword), R(r_maskinptr)) | OFFSET(0), "LDR     r_maskinword,[r_maskinptr]        ; fetch first mask word");
+      MOV(R(r_pixel), OP2R(R(r_maskinshift)) | LSRI(27),      "MOV     r_pixel,r_maskinshift,LSR #27     ; get real shift distance");
+      RSB(R(r_pixel), R(r_pixel), IMM(32),                    "RSB     r_pixel,r_pixel,#32         ; mask shift");
+    }
+    else
+      ins(ws, LDR(R(r_maskinword),
+              R(r_inptr)) | INDEX(R(r_masko), 0),             "LDR     r_maskinword,[r_inptr,r_masko]    ; fetch first mask word");
+    MOV(R(r_maskinword), OP2R(R(r_maskinword)) | RORR(R(r_pixel)),"MOV     r_maskinword,r_maskinword,ROR r_pixel "
+                                                              "; current mask pixel now in least sig bit[s]");
+  }
+  if (!DEST_32_BIT) /* Set up outword */
+  {
+    if (ws->gcol == 0 && !SOURCE_MASK && !PLOTMASK)
+    {
+      /* Faster in the inner loop, but the unneeded pixels must be cleared out first */
+      MOV(R(r_pixel), S | OP2R(R(r_outshift)) | LSRI(27),     "MOVS    r_pixel,r_outshift,LSR #27      ; get real shift distance");
+      ins(ws, NE | LDR(R(r_outword), R(r_outptr)) | OFFSET(0),  "LDRNE   r_outword,[r_outptr]            ; load up output word");
+      MOV(R(r_outword), NE | OP2R(R(r_outword))
+                      | LSLR(R(r_pixel)),                     "MOVNE   r_outword,r_outword,LSL r_pixel "
+                                                              "; set untouched pixels to correct places, clear the others");
+      MOV(R(r_outword), EQ | IMM(0),                          "MOVEQ   r_outword,#0                    ; if r_pixel=0, make them all clear");
+    }
+    else
+    {
+      ins(ws, LDR(R(r_outword), R(r_outptr)) | OFFSET(0),     "LDR     r_outword,[r_outptr]            ; load up output word");
+      MOV(R(r_pixel), OP2R(R(r_outshift)) | LSRI(27),         "MOV     r_pixel,r_outshift,LSR #27      ; get real shift distance");
+      RSB(R(r_pixel), R(r_pixel), IMM(32),                    "RSB     r_pixel,r_pixel,#32             ; temp use of r_pixel");
+      MOV(R(r_outword), OP2R(R(r_outword)) | RORR(R(r_pixel)),"MOV     r_outword,r_outword,ROR r_pixel "
+                                                              "; current output pixel now in least sig bit[s]");
+      /* Set up inword from ECF pattern - uses r_pixel value */
+      if (PLOTMASK)
+      {
+        ins(ws, LDR(R(r_inword), R(r_inptr))
+              | INDEX(R(r_ecfindex), 0),                      "LDR     r_inword,[r_inptr,r_ecfindex]   ; get ECF pattern word");
+        MOV(R(r_inword), OP2R(R(r_inword)) | RORR(R(r_pixel)),"MOV     r_inword,r_inword,ROR r_pixel  1 "
+                                                              "; current ECF pixel now in least sig bit[s]");
+        ADD(R(r_ecfindex), R(r_ecfindex),
+              IMM(4),                                           "ADD     r_ecfindex,r_ecfindex,#4        ; to load EOR word 1");
+        ins(ws, LDR(R(r_bgcolour), R(r_inptr))
+              | INDEX(R(r_ecfindex), 0),                        "LDR     r_bgcolour,[r_inptr,r_ecfindex]   ;fetch next EOR word of ECF1");
+        SUB(R(r_ecfindex), R(r_ecfindex),
+              IMM(4),                                           "SUB     r_ecfindex,r_ecfindex,#4        ;blah1");
+        MOV(R(r_bgcolour), OP2R(R(r_bgcolour)) | RORR(R(r_pixel)),"MOV     r_bgcolour,r_bgcolour,ROR r_pixel  1 ");
+      }
+    }
+  }
+static void loop_x(asm_workspace *wp, workspace *ws)
+/* The variables are set up - perform the inner loop that processes a
+ * single line. Fall out of the bottom of the loop when complete.
+ */
+  BOOL mask_possible;
+  comment(ws, "The inner loop: iterating along a row of pixels.");
+  if (x_block_move(wp, ws))
+  {
+    comment(ws, "Very simple inner loop - we use an existing block-move primitive");
+    MOV(R(lr), OP2R(R(pc)),                                  "MOV     lr,pc                           ; remember return address");
+    MOV(R(pc), OP2R(R(r_blockroutine)),                      "MOV     pc,r_blockroutine               ; block move");
+    /* It would be a little bit more efficient to do state saving here rather than inside the routine,
+     * and so only save registers that need to be saved - not a big saving, and only per-line.
+     */
+  }
+  else
+  {
+    init_word_registers(wp, ws);
+    if (simple_x_scale(wp, ws)) /* 1:1 scaling */
+    {
+      comment(ws, "1:1 scaling along x, so each source pixel is painted once");
+#if 0
+      align16(wp, ws);
+      DEFINE_LABEL(loop_x_repeat, "Loop around for each source/dest pixel")
+      mask_possible = fetch_pixel(wp, ws, &ws->labels.l_masked);
+      translate_pixel(wp, ws);
+      save_pixel(wp, ws);
+      if (mask_possible) DEFINE_LABEL(l_masked, "This pixel masked out")
+      fetch_pixel_inc(wp, ws);
+      save_pixel_inc(wp, ws);
+      SUB(R(r_xsize), R(r_xsize), S | IMM(1),                    "SUBS    r_xsize,r_xsize,#1");
+      branch(ws, B | NE, L(loop_x_repeat),                       "BNE     loop_x_repeat");
+      /* We generate a loop that does two pixels at a time, only advancing pointers, counts, shifts
+       * etc. every two pixels. There are two versions of this loop, one where the in and out shifts
+       * are 'in phase' (ie initially both even or both odd), one where they are out of phase. There
+       * is also some initial stuff to get the outshift to be even if necessary when entering either
+       * of these, and some final stuff to patch up the end.
+       */
+      comment(ws, "Optimised 2-at-a-time loop");
+      if (!DEST_32_BIT)
+      {
+        TST(R(r_outshift), IMM(wp->BPC*2) | IMMROR(6),           "TST     r_outshift,#out_bpc:SHL:27      ; start at odd or even pixel shift?");
+        branch(ws, B | EQ, L(x_evenstart),                       "BEQ     x_evenstart                     ; B if even");
+        comment(ws, "r_outshift an odd number of pixels - process just one of these");
+        mask_possible = fetch_pixel(wp, ws, &ws->labels.x_oddmask);
+        translate_pixel(wp, ws);
+        save_pixel(wp, ws);
+        if (mask_possible) DEFINE_LABEL(x_oddmask, "This pixel masked out")
+        fetch_pixel_inc(wp, ws);
+        save_pixel_inc(wp, ws);
+        SUB(R(r_xsize), R(r_xsize), S | IMM(1),                  "SUBS    r_xsize,r_xsize,#1              ; count towards overall width");
+        branch(ws, B | EQ, L(loop_x_exit),                       "BEQ     loop_x_exit                     ; check for just one pixel wide");
+        DEFINE_LABEL(x_evenstart, "r_outshift is an even number of pixels")
+      }
+      if (!SOURCE_32_BIT)
+      {
+        TST(R(r_inshift), IMM(ws->in_bpc*2) | IMMROR(6),         "TST     r_inshift,#in_bpc:SHL:27        ; input at odd or even pixel shift?");
+        branch(ws, B | NE, L(x_misaligned),                      "BNE     x_misaligned                    ; B if odd");
+      }
+      branch(ws, B, L(x_aligned_enter),                          "B       x_aligned_enter                 ; else, in phase with output - start loop");
+      newline();
+      align16(wp, ws);
+      DEFINE_LABEL(x_aligned_loop, "The 2-at-a-time inner loop, aligned case")
+      mask_possible = fetch_pixel(wp, ws, &ws->labels.x_alignmask1);
+      translate_pixel(wp, ws);
+      save_pixel(wp, ws);
+      if (mask_possible) DEFINE_LABEL(x_alignmask1, "First pixel masked out")
+      odither_inc(wp, ws, 0);
+      mask_possible = fetch_pixel2(wp, ws, &ws->labels.x_alignmask2);
+      translate_pixel(wp, ws);
+      save_pixel2(wp, ws);
+      if (mask_possible) DEFINE_LABEL(x_alignmask2, "Second pixel masked out")
+      fetch_pixel_inc2(wp, ws);
+      save_pixel_inc2(wp, ws);
+      DEFINE_LABEL(x_aligned_enter, "Entering the aligned 2-at-a-time inner loop")
+      SUB(R(r_xsize), R(r_xsize), S | IMM(2),                    "SUBS    r_xsize,r_xsize,#2              ; done 2 pixels");
+      branch(ws, B | GE, L(x_aligned_loop),                      "BGE     x_aligned_loop                  ; loop until 0 or 1 left");
+      if (!SOURCE_32_BIT)
+      {
+        branch(ws, B, L(x_2atatime_exit),                        "B       x_2atatime_exit                 ; final patchup code");
+        newline();
+        DEFINE_LABEL(x_misaligned, "The 2-at-a-time inner loop, misaligned case, entry sequence")
+        /* A bit delicate - we have to prepare the input stream for an inc2 call,
+         * by effectively winding it back by a pixel. We know this won't go back a word,
+         * however, because r_inshift is an odd number of pixels.
+         */
+        comment(ws, "Wind input stream back by a pixel");
+        if (SOURCE_32_BIT)
+          SUB(R(r_inptr), R(r_inptr), IMM(4),                    "SUB     r_inptr,r_inptr,#4              ; wind back a pixel");
+        else
+        {
+          MOV(R(r_inword), OP2R(R(r_inword)) | LSLI(ws->in_bpp), "MOV     r_inword,r_inword,LSL #in_bpp   ; wind back a pixel");
+          ADD(R(r_inshift), R(r_inshift),
+              IMM(ws->in_bpp*2) | IMMROR(6),                     "ADD     r_inshift,r_inshift,#in_bpp:SHL:27");
+        }
+        if (SOURCE_MASK)
+        {
+          MOV(R(r_maskinword), OP2R(R(r_maskinword))
+                             | LSLI(ws->mask_bpp),               "MOV     r_maskinword,r_maskinword,LSL #mask_bpp");
+          if (SOURCE_BPPMASK)
+            ADD(R(r_maskinshift), R(r_maskinshift),
+                IMM(ws->mask_bpp*2) | IMMROR(6),                 "ADD     r_maskinshift,r_maskinshift,#mask_bpp:SHL:27");
+        }
+        branch(ws, B, L(x_misaligned_enter),                     "B       x_misaligned_enter              ; start misaligned loop");
+        align16(wp, ws);
+        DEFINE_LABEL(x_misaligned_loop, "The 2-at-a-time inner loop, misaligned case")
+        mask_possible = fetch_pixel2(wp, ws, &ws->labels.x_misalignmask1);
+        translate_pixel(wp, ws);
+        save_pixel(wp, ws);
+        if (mask_possible) DEFINE_LABEL(x_misalignmask1, "A pixel masked out")
+        fetch_pixel_inc2(wp, ws);
+        odither_inc(wp, ws, 0);
+        mask_possible = fetch_pixel(wp, ws, &ws->labels.x_misalignmask2);
+        translate_pixel(wp, ws);
+        save_pixel2(wp, ws);
+        if (mask_possible) DEFINE_LABEL(x_misalignmask2, "Another pixel masked out")
+        save_pixel_inc2(wp, ws);
+        DEFINE_LABEL(x_misaligned_enter, "Entering the misaligned 2-at-a-time inner loop")
+        SUB(R(r_xsize), R(r_xsize), S | IMM(2),                  "SUBS    r_xsize,r_xsize,#2              ; count towards overall size");
+        branch(ws, B | GE, L(x_misaligned_loop),                 "BGE     x_misaligned_loop               ; and loop until done");
+        fetch_pixel_inc(wp, ws);
+        newline();
+        DEFINE_LABEL(x_2atatime_exit, "Final patchup for 2-at-a-time inner loop")
+      }
+      else
+        newline();
+      ADD(R(r_xsize), R(r_xsize), S | IMM(2),                    "ADDS    r_xsize,r_xsize,#2              ; up to 0 or 1");
+      branch(ws, B | EQ, L(loop_x_exit1),                        "BEQ     loop_x_exit1                    ; No last pixel to be done\n");
+      mask_possible = fetch_pixel(wp, ws, &ws->labels.x_lastmask);
+      translate_pixel(wp, ws);
+      save_pixel(wp, ws);
+      if (mask_possible) DEFINE_LABEL(x_lastmask, "Last pixel masked out")
+      fetch_pixel_inc(wp, ws);
+      save_pixel_inc(wp, ws);
+      DEFINE_LABEL(                                     loop_x_exit1, "End of input pixel line (1)")
+    }
+    else
+    {
+      comment(ws, "Control of scaling along x");
+      if (ws->odither && wp->save_xadd - wp->save_xdiv > wp->save_xdiv)
+      {
+        /* If dithering and scaling we have to be very careful about where we do fetch_pixel_inc, because when replicating
+         * a pixel we must repeatedly fetch_pixel it.
+         */
+        SUB_A(r_xcount, wp->save_xadd)
+        DEFINE_LABEL(                                       loop_x_repeat, "Loop around for each source pixel (ordered dither)")
+        ADD_A(r_xcount, wp->save_xadd)  /*(GPS)*/
+        mask_possible = fetch_pixel(wp, ws, &ws->labels.l_masked);
+        SUBS_A(r_xcount, wp->save_xdiv)  /* Stop dither from printing 1 too many pixels... (GPS) */
+        DEFINE_LABEL(                                       loop_put_pixel_repeat, "Repeatedly paint and ordered-dither a source pixel");
+        translate_pixel(wp, ws);
+        save_pixel(wp, ws);
+        save_pixel_inc(wp, ws);
+        SUB(R(r_xsize), R(r_xsize), S | IMM(1),                    "SUBS    r_xsize,r_xsize,#1              ; count output ordered dither pixels");
+        branch(ws, B | EQ, L(loop_x_exit),                         "BEQ     loop_x_exit                     ; painted enough pixels");
+        /* We must not paint the same pixel repeatedly - we must reextract and retranslate it, otherwise
+         * the dithering on scaled up pixels will not occur.
+         */
+        fetch_pixel_unmasked(wp, ws); /* reextract the pixel into r_pixel */
+        SUBS_A(r_xcount, wp->save_xdiv)  /* Decrement count (GPS) */
+        branch(ws, B | PL, L(loop_put_pixel_repeat),               "BPL     loop_put_pixel_repeat           ; recalculate and repaint");
+        fetch_pixel_inc(wp, ws); /* moved by (GPS) */
+        branch(ws, B, L(loop_x_repeat),                            "B       loop_x_repeat                   ; next input pixel");
+      }
+      else
+      {
+        if ( !PLOTMASK && (wp->save_xmag % wp->save_xdiv) == 0 && ((wp->save_xmag / wp->save_xdiv) > 4) && ws->gcol == 0)
+                 /* do optimised code */
+        {
+          register int toskip = wp->save_xmag / wp->save_xdiv;
+          tracef("in optimised scale\nxmag = %d, xdiv = %d, xmag mod xdiv = %d\n" _ wp->save_xmag _ wp->save_xdiv _ wp->save_xmag % wp->save_xdiv);
+          SUB_A(r_xcount, toskip)
+          DEFINE_LABEL(                                       loop_x_repeat, "3Loop around for each source pixel")
+          TEQ(R(r_xsize), IMM(0),                                       "3TEQ     r_xsize, #0");
+          DEFINE_LABEL(loop_x_exitskip,          "3Kludge to avoid multiple forward references");
+          branch(ws, B | EQ, L(loop_x_exit),                      "3BEQ     loop_x_exit");
+          ADD_A(r_xcount, toskip)
+          mask_possible = fetch_pixel(wp, ws, &ws->labels.l_masked);
+          translate_pixel(wp, ws); /* If we're about the discard the pixel this is in fact wasted work - we could reorganise
+                                    * this whole loop to improve that situation, but it doesn't really seem worthwhile, the gain
+                                    * is not enormous.
+                                    */
+          fetch_pixel_inc(wp, ws);
+          comment(ws, "3calculating number of times to plot pixel 1");
+          MOV(R(r_temp1), OP2R(R(r_xsize)),                            "3MOV     r_temp1, r_xsize               ; store r_xsize");
+          SUB(R(r_xsize), R(r_xsize), S | OP2R(R(r_xcount)),            "3SUBS    r_xsize, r_xsize, r_xcount  ; count output pixels");
+          MOV(R(r_xsize), MI | IMM(0),                                  "3MOVMI   r_xsize, #0                                          ");
+          MOV(R(r_xcount), MI | OP2R(R(r_temp1)),                       "3MOVMI   r_xcount, r_temp1                                          ");
+          if (!DEST_32_BIT)
+          {
+            MOV(R(r_temp1), S | OP2R(R(r_outshift)) | LSRI(27),           "3MOVS    r_temp1, r_outshift, LSR #27");
+            MOV(R(r_temp1), EQ | IMM(32),                                 "3MOVEQ   r_temp1, #32                    ; 0 in r_outshift => 32 bits left");
+            if (!DEST_1_BIT)
+              MOV(R(r_temp1), OP2R(R(r_temp1)) | LSRI(ws->out_l2bpc),       "3MOV     r_temp1, r_temp1, LSR #out_log2bpc");
+            CMP(R(r_xcount), OP2R(R(r_temp1)),                            "3CMP     r_xcount, r_temp1");
+            branch(ws, B + LT, L(loop2),                                  "3BLT     loop2                   ; end of this masked input pixel");
+          }
+          plot_current_output_words(wp, ws, toskip);
+          if (DEST_32_BIT)
+          {
+            branch(ws, B, L(loop_x_repeat),                          "11B     loop_x_repeat                   ; end of this masked input pixel");
+          }
+          else
+          {
+            TEQ(R(r_xcount), IMM(0),                                      "1TEQ     r_xcount, #0");
+            branch(ws, B + EQ, L(loop_x_repeat),                         "1BEQ     loop_x_repeat                   ; end of this masked input pixel");
+            DEFINE_LABEL(loop2, "Last word to plot")
+            plot_some_pixels(wp, ws);
+            branch(ws, B, L(loop_x_repeat),                              "1B       loop_x_repeat                   ; end of this masked input pixel");
+          }
+#if 0
+          int   loop;
+          comment(ws, "Doing multiple plots of same pixel in line");
+          DEFINE_LABEL(                                       loop_x_repeat, "Loop around for each source pixel")
+          CMN(R(pc), OP2R(R(pc)),                                 "CMN     pc, pc   ; this will clear the Z flag");
+          DEFINE_LABEL(loop_x_exitskip,          "Kludge to avoid multiple forward references");
+          branch(ws, B | EQ, L(loop_x_exit),                      "BEQ     loop_x_exit");
+          mask_possible = fetch_pixel(wp, ws, &ws->labels.l_masked);
+          translate_pixel(wp, ws); /* If we're about the discard the pixel this is in fact wasted work - we could reorganise
+                                    * this whole loop to improve that situation, but it doesn't really seem worthwhile, the gain
+                                    * is not enormous.
+                                    */
+          fetch_pixel_inc(wp, ws);
+          for (loop = 0;loop < (wp->save_xmag / wp->save_xdiv);loop++)
+          {
+            save_pixel(wp, ws);
+            save_pixel_inc(wp, ws);
+            SUB(R(r_xsize), R(r_xsize), S | IMM(1),                    "SUBS    r_xsize,r_xsize,#1              ; count for each output pixel");
+            branch(ws, B | EQ, L(loop_x_exitskip),                     "BEQ     loop_x_exitskip");
+          }
+          branch(ws, B , L(loop_x_repeat),                             "B     loop_x_repeat                   ; discard this pixel");
+        }
+        else
+        {
+          /* >>> There's not all that much point in this being separate from the odither case - could really
+           * abandon this one and use the dithering one all the time, with tiny variants. Not done.
+           */
+          SUB_A(r_xcount, wp->save_xadd)
+          DEFINE_LABEL(                                       loop_x_repeat, "Loop around for each source pixel")
+          ADD_A(r_xcount, wp->save_xadd)
+          mask_possible = fetch_pixel(wp, ws, &ws->labels.l_masked);
+          translate_pixel(wp, ws); /* If we're about the discard the pixel this is in fact wasted work - we could reorganise
+                                    * this whole loop to improve that situation, but it doesn't really seem worthwhile, the gain
+                                    * is not enormous.
+                                    */
+          fetch_pixel_inc(wp, ws);
+          DEFINE_LABEL(loop_put_pixel_repeat, "Loop around to repeatedly paint a source pixel");
+          SUBS_A(r_xcount, wp->save_xdiv)
+          branch(ws, B | MI, L(loop_x_repeat),                       "BMI     loop_x_repeat                   ; discard this pixel");
+          save_pixel(wp, ws);
+          save_pixel_inc(wp, ws);
+          SUB(R(r_xsize), R(r_xsize), S | IMM(1),                    "SUBS    r_xsize,r_xsize,#1              ; count for each output pixel");
+          branch(ws, B | NE, L(loop_put_pixel_repeat),               "BNE     loop_put_pixel_repeat");
+          branch(ws, B, L(loop_x_exit),                              "B       loop_x_exit              ; skip code for masked pixels");/* moved from next if (GPS) */
+        }
+      }
+      if (mask_possible)
+      {
+        DEFINE_LABEL(l_masked, "This source pixel masked out")
+        if (!PLOTMASK && (wp->save_xmag % wp->save_xdiv) == 0 && ((wp->save_xmag / wp->save_xdiv) > 4) && ws->gcol == 0)
+        {
+#if 1
+          fetch_pixel_inc(wp, ws);
+          comment(ws, "calculating number of times to plot pixel");
+          MOV(R(r_temp1), OP2R(R(r_xsize)),                             "@MOV     r_xtemp1, r_xsize               ; store r_xsize");
+          SUB(R(r_xsize), R(r_xsize), S | OP2R(R(r_xcount)),             "@SUBS    r_xsize, r_xsize, r_xcount  ; count output pixels");
+          MOV(R(r_xsize), MI | IMM(0),                                  "@MOVMI   r_xsize, #0                                          ");
+          MOV(R(r_xcount), MI | OP2R(R(r_temp1)),                       "@MOVMI   r_xcount, r_temp1                                          ");
+          if (!DEST_32_BIT)
+          {
+            MOV(R(r_temp1), S | OP2R(R(r_outshift)) | LSRI(27),           "@@MOVS    r_temp1, r_outshift, LSR #27");
+            MOV(R(r_temp1), EQ | IMM(32),                              "@@MOVEQ   r_temp1, #32                    ; 0 in r_outshift => 32 bits left");
+            if (!DEST_1_BIT)
+              MOV(R(r_temp1), OP2R(R(r_temp1)) | LSRI(ws->out_l2bpc),       "@@MOV     r_temp1, r_temp1, LSR #log2bpc");
+            CMP(R(r_xcount), OP2R(R(r_temp1)),                            "@@CMP     r_xcount, r_temp1");
+            branch(ws, B + LT, L(loop1),                                  "@@BLT     loop1                   ; end of this masked input pixel");
+          }
+          skip_current_output_words(wp, ws);
+          if (DEST_32_BIT)
+          {
+            branch(ws, B, L(loop_x_repeat),                         "1@B     loop_x_repeat                   ; end of this masked input pixel");
+          }
+          else
+          {
+            TEQ(R(r_xcount), IMM(0),                                     "1@TEQ     r_xcount, #0");
+            branch(ws, B + EQ, L(loop_x_repeat),                        "1@BEQ     loop_x_repeat                   ; end of this masked input pixel");
+            DEFINE_LABEL(loop1, "Last word to skip")
+            skip_some_pixels(wp, ws);
+            branch(ws, B, L(loop_x_repeat),                            "1@@B       loop_x_repeat                   ; end of this masked input pixel");
+          }
+          int loop;
+          fetch_pixel_inc(wp, ws);
+          for (loop = 0;loop < (wp->save_xmag / wp->save_xdiv);loop++)
+          {
+            save_pixel_inc(wp, ws);
+            SUB(R(r_xsize), R(r_xsize), S | IMM(1),                    "SUBS    r_xsize,r_xsize,#1              ; count output pixels");
+            branch(ws, B | EQ, L(loop_x_exitskip),              "BEQ     loop_x_exitskip");
+          }
+          branch(ws, B, L(loop_x_repeat),                       "B       loop_x_repeat                   ; end of this masked input pixel");
+        }
+        else
+        {
+          fetch_pixel_inc(wp, ws);
+          DEFINE_LABEL(loop_put_masked_repeat, "Loop around to skip over dest pixels");
+          SUBS_A(r_xcount, wp->save_xdiv)
+          branch(ws, B | MI, L(loop_x_repeat),                       "BMI     loop_x_repeat                   ; end of this masked input pixel");
+          save_pixel_inc(wp, ws);
+          SUB(R(r_xsize), R(r_xsize), S | IMM(1),                    "SUBS    r_xsize,r_xsize,#1              ; count output pixels");
+          branch(ws, B | NE, L(loop_put_masked_repeat),              "BNE     loop_put_masked_repeat");
+        }
+      }
+    }
+    DEFINE_LABEL(                                     loop_x_exit, "End of input pixel line")
+    newline();
+    if (!DEST_32_BIT)
+    {
+      comment(ws, "End of x loop - ensure any contents of r_outword are written out.");
+      MOV(R(r_outshift), S | OP2R(R(r_outshift)) | LSRI(27),     "MOVS    r_outshift,r_outshift,LSR #27   ; get real output shift distance");
+      MOV(R(r_outshift), EQ | IMM(32),                           "MOVEQ   r_outshift,#32                  "
+                                                                 "; number of useful new bits in r_outword");
+      if (ws->gcol == 0 && !SOURCE_MASK)
+      {
+        /* If setting pixels we must pick up the word we're about to
+         * partially overwrite, and combine the new and old pixels.
+         */
+        comment(ws, "The top 32-r_outshift bits of r_outword are new pixels.");
+        MOV(R(r_outword), OP2R(R(r_outword)) | LSRR(R(r_outshift)),"MOV     r_outword,r_outword,LSR r_outshift ; get new pixels in correct place");
+        ins(ws, LDR(R(r_pixel), R(r_outptr)) | OFFSET(0),        "LDR     r_pixel,[r_outptr]              ; temporary use of r_pixel");
+        RSB(R(r_outshift), R(r_outshift), IMM(32),               "RSB     r_outshift,r_outshift,#32");
+        MOV(R(r_pixel), OP2R(R(r_pixel)) | LSRR(R(r_outshift)),  "MOV     r_pixel,r_pixel,LSR r_outshift  ; shift to clear out old pixels");
+        ORR(R(r_outword), R(r_outword),
+              OP2R(R(r_pixel)) | LSLR(R(r_outshift)),            "ORR     r_outword,r_outword,r_pixel, LSL r_outshift ; combine old and new");
+        ins(ws, STR(R(r_outword), R(r_outptr)) | OFFSET(0),      "STR     r_outword,[r_outptr]            ; store updated word");
+      }
+      else
+      {
+        MOV(R(r_outword), OP2R(R(r_outword)) | RORR(R(r_outshift)),"MOV     r_outword,r_outword,ROR r_outshift");
+        ins(ws, STR(R(r_outword), R(r_outptr)) | OFFSET(0),        "STR     r_outword,[r_outptr]");
+      }
+    }
+  }
+*                                                                         *
+*    Bitblit: Overall construction of the Y loop.                         *
+*                                                                         *
+static void loop_y(asm_workspace *wp, workspace *ws, j_decompress_ptr cinfo)
+/* Overall control of the code and outer loop.
+ *
+ * Emits the whole generated routine in this order: save of the entry registers,
+ * register allocation (ptrs_rn, xloop_rn, yloop_rn), initial values for the x-loop
+ * and y-loop variables, the per-row loop (label y_loop) whose body is produced by
+ * loop_x, control of scaling in the y direction, and finally the stack discard,
+ * register restore and return.
+ *
+ *   wp    - assembler workspace; read here for the pixel depths (save_inlog2bpp,
+ *           save_inlog2bpc, BPP, BPC), the y scale factors (save_yadd, save_ydiv),
+ *           save_outoffset, is_it_jpeg and dither_truecolour.
+ *   ws    - compiler workspace; the derived constants (in_bpp, in_pixmask, mask_bpp,
+ *           out_ppw, out_pixmask, odither, ...) are written into it before any code
+ *           is emitted.
+ *   cinfo - only cinfo->jpeg_color_space is read in this function, and only when
+ *           wp->is_it_jpeg is set, to choose the ordered-dither strength for 8bpp output.
+ */
+  /* Declare the registers */
+  int yrn;
+  int x_loop_save_mask;
+  int y_loop_save_mask;
+  int ptrs_save_mask;    /* r_inptr, r_outptr, and (if it exists) r_maskinptr */
+  int x_loop_save_size;
+  int ptrs_save_size;
+  BOOL onebank; /* one bank of registers, or two */
+#ifdef DEBUG
+  char xregs[256];
+  char yregs[256];
+  char ptrregs[256];
+  char a[256];
+  /* Various useful constants not provided directly by wp. */
+  newline();
+  comment(ws, "Various useful constants");
+    comment(ws, "Double-pixel input - pixels are not the same as double-pixels");
+  else
+    comment(ws, "Not double-pixel input - pixels are exactly the same as double-pixels");
+  ws->in_bpp         = 1 << wp->save_inlog2bpp;
+  ws->in_bpc         = 1 << wp->save_inlog2bpc;
+  ws->in_pixmask     = (1 << ws->in_bpp) - 1;
+  tracef("%t20.in_bpp  *       %i %t68; bits per input pixel\n" _ ws->in_bpp);
+  tracef("%t20.in_bpc  *       %i %t68; bits per input double-pixel ('character')\n" _ ws->in_bpc);
+  if (ws->in_bpp <= 8) tracef("%t20.in_pixmask *    %i %t68; input pixel mask\n" _ ws->in_pixmask);
+  {
+    if (SOURCE_BPPMASK) /* a bit mask */
+    {
+      ws->mask_bpp     = 1;
+      ws->mask_bpc     = 1;
+      ws->mask_pixmask = 1;
+    }
+    else
+    {
+      ws->mask_bpp     = ws->in_bpp;
+      ws->mask_bpc     = ws->in_bpc;
+      ws->mask_pixmask = ws->in_pixmask;
+    }
+    tracef("%t20.mask_bpp *      %i %t68; bits per mask pixel\n" _ ws->mask_bpp);
+    tracef("%t20.mask_bpc *      %i %t68; bits per mask double-pixel\n" _ ws->mask_bpc);
+    tracef("%t20.mask_pixmask *  %i %t68; mask pixel mask\n" _ ws->mask_pixmask);
+  }
+  else
+    comment(ws, "No input mask");
+    comment(ws, "Double-pixel output - pixels are not the same as double-pixels");
+  else
+    comment(ws, "Not double-pixel output - pixels are exactly the same as double-pixels");
+  ws->out_l2ppw      = 5 - ws->out_l2bpc;
+  ws->out_ppw        = 1 << ws->out_l2ppw;
+  ws->out_pixmask    = (1 << wp->BPP) - 1;
+  ws->out_dpixmask   = (1 << wp->BPC) - 1;
+  tracef("%t20.out_bpp *       %i %t68; bits per output pixel\n" _ wp->BPP);
+  tracef("%t20.out_bpc *       %i %t68; bits per output double-pixel\n" _ wp->BPC);
+  tracef("%t20.out_l2bpp *     %i %t68; log base 2 of bits per output pixel\n" _ ws->out_l2bpp);
+  tracef("%t20.out_l2bpc *     %i %t68; log base 2 of bits per output double-pixel\n" _ ws->out_l2bpc);
+  tracef("%t20.out_ppw *       %i %t68; double-pixels per output word\n" _ ws->out_ppw);
+  tracef("%t20.out_l2ppw *     %i %t68; log base 2 of double-pixels per output word\n" _ ws->out_l2ppw);
+  if (wp->BPC <= 8)
+  {
+    tracef("%t20.out_pixmask *   %i %t68; output pixel mask\n" _ ws->out_pixmask);
+    tracef("%t20.out_dpixmask *  %i %t68; output double-pixel mask\n" _ ws->out_dpixmask);
+  }
+  /* Setting up ordered dither, if required */
+  if (  !PLOTMASK                     /* if plotting sprite */
+     && ws->in_bpp >= 16              /* from true colour source */
+     && wp->BPP < ws->in_bpp          /* and losing resolution */
+     && (wp->dither_truecolour & 1)
+     && !(wp->is_it_jpeg && (wp->dither_truecolour & 2))
+     )
+  {
+    tracef("in dither_truecolour = %x\n" _ wp->dither_truecolour);
+    comment(ws, "Ordered dither being used");
+    /* If not 0 then ws->odither is the number of bits - 1 being truncated from 8-bit source colour values */
+    if (wp->BPP == 16) /* dithering down from 32 bit to 16 bit */
+      ws->odither = 2;
+    else  /* dithering down from 16 or 32 bit, to 1/2/4/8 bit. */
+    {
+      if (ws->out_l2bpp == 3) /* 8bpp */
+      {
+        if (wp->is_it_jpeg && cinfo->jpeg_color_space == JCS_GRAYSCALE)
+          ws->odither = 3; /* dither assuming 4 bits of grey represented */
+        else
+          ws->odither = 4; /* seems to work better for colour than 3, which is what you might expect if
+                            * you were assuming 4 bits of colour per gun. In other words, the tint is NOT
+                            * effective enough at representing the next two bits of colour output!
+                            * If the source is known to be greyscale then 3 is a better value.
+                            */
+      }
+      else
+        ws->odither = 6 - ws->out_l2bpp; /* 6, 5 or 4 for 2, 4, or 16 colour output (2, 4 or 8 grey level) */
+    }
+    tracef("%t20.odither_eorvalue * 1:SHL:(24+%i) %t68; value to EOR into r_oditheradd each pixel" _ ws->odither);
+  }
+    tracef("out dither_truecolour = %x\n" _ wp->dither_truecolour);
+  newline();
+  ins(ws, PUSH | 0x5fff,                                    "STMDB   sp!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} ; save entry registers");
+  newline();
+  comment(ws, "Register declarations");
+  if (wp->is_it_jpeg)
+  {
+    ws->leave_r12_alone = TRUE;
+    comment(ws, "Leave r12 unallocated, it contains the assembler module workspace pointer");
+  }
+  ptrs_rn(wp, ws);
+  ptrs_save_mask = (1<<ws->next_free_reg) - 1; /* bits 0..next_free_reg-1 */
+  IFDEBUG(ldm_reg_list(ws, ptrregs, ptrs_save_mask, FALSE);)
+  ptrs_save_size = SOURCE_BPPMASK || PLOTMASK ? 12 : 8; /* bytes: 12 if SOURCE_BPPMASK or PLOTMASK, otherwise 8 */
+  if (ws->odither) ptrs_save_size += 4; /* one extra word when ordered dither is in use */
+  xloop_rn(wp, ws);
+  x_loop_save_mask = ((1<<ws->next_free_reg) - 1) & ~ptrs_save_mask; /* bits below next_free_reg not already in ptrs_save_mask */
+  x_loop_save_size = 4 * ws->next_free_reg - ptrs_save_size;      /* size in bytes, used right at the end */
+  /* Of the x-loop variables, no need to save r_inword/outword/maskinword/temp1/temp2 - set up every time round */
+  if (ws->regnames.r_inword.regno != -1) {x_loop_save_mask &= ~(1 << ws->regnames.r_inword.regno); x_loop_save_size -= 4;}
+  if (ws->regnames.r_outword.regno != -1) {x_loop_save_mask &= ~(1 << ws->regnames.r_outword.regno); x_loop_save_size -= 4;}
+  if (ws->regnames.r_maskinword.regno != -1) {x_loop_save_mask &= ~(1 << ws->regnames.r_maskinword.regno); x_loop_save_size -= 4;}
+  if (ws->regnames.r_temp1.regno != -1) {x_loop_save_mask &= ~(1 << ws->regnames.r_temp1.regno); x_loop_save_size -= 4;}
+  if (ws->regnames.r_temp2.regno != -1) {x_loop_save_mask &= ~(1 << ws->regnames.r_temp2.regno); x_loop_save_size -= 4;}
+  IFDEBUG(ldm_reg_list(ws, xregs, x_loop_save_mask, FALSE);)
+  yrn = yloop_rn_count(wp, ws);
+  onebank = yrn + ws->next_free_reg + ws->leave_r12_alone <= 13; /* <= binds last: (yrn + next_free_reg + leave_r12_alone) <= 13 */
+  comment(ws, onebank ? "The y loop variables will fit in registers too"
+                      : "The y loop variables are overlaid on the x ones");
+  if (!onebank) ws->next_free_reg = 4; /* Overlay the x-loop register allocations - but not ptr registers */
+  yloop_rn(wp, ws);
+  if (!onebank) /* If two banks, be prepared to do LDM/STM for the y-loop bank */
+  {
+    y_loop_save_mask = ((1<<ws->next_free_reg) - 1) & 0xfffffff0; /* not regs 0..3 */
+    IFDEBUG(ldm_reg_list(ws, yregs, y_loop_save_mask, TRUE);)
+  }
+  newline();
+  comment(ws, "Load up initial values of x-loop variables");
+  fetch_pixel_init(wp, ws);
+  save_pixel_init(wp, ws);
+  xloop_init(wp, ws);
+  tracef("%t20.x_loop_save_size * %t28%i %t68.; Bytes of stack for x-loop variables\n" _ x_loop_save_size);
+  tracef("%t20.ptrs_save_size * %t28%i %t68.; Bytes of stack for ptr variables\n" _ ptrs_save_size);
+  comment(ws, "Save x-loop and pointer variables on the stack");
+  IFDEBUG(do_sprintf(a, "STMDB   sp!,{%s,%s}", ptrregs,xregs);)
+  /* Added by (GPS) to get round spilled reg bug. */
+  if(ws->odither && SOURCE_16_BIT)
+  {
+    tracef("x_loop_save_mask = %x\n" _ x_loop_save_mask);
+    x_loop_save_mask &= ~(1<<(ws->regnames.r_xcount.regno));
+    x_loop_save_mask |= (1<<(ws->regnames.r_pixel.regno));
+    tracef("x_loop_save_mask = %x\n" _ x_loop_save_mask);
+    ins(ws, PUSH | x_loop_save_mask | ptrs_save_mask, a);
+    x_loop_save_mask |= (1<<(ws->regnames.r_xcount.regno)); /* put r_xcount back in the mask for the LDMIAs emitted later */
+    x_loop_save_mask &= ~(1<<(ws->regnames.r_pixel.regno)); /* and take r_pixel out of it again */
+    comment(ws, "r_pixel pushed instead of x-count");
+#ifdef DEBUG
+    tracef("x_loop_save_mask = %x\n" _ x_loop_save_mask);
+  }
+  else
+  {
+    ins(ws, PUSH | x_loop_save_mask | ptrs_save_mask, a);
+  }
+  /* end added code...*/
+  newline();
+  comment(ws, "Load up initial values of y-loop variables");
+  yloop_init(wp, ws);
+  if(ws->odither && SOURCE_16_BIT) /* same condition as the r_pixel-for-r_xcount push above */
+  {
+    MOV(R(r_xcount), OP2R(R(r_pixel)),                              "MOV     r_xcount,r_pixel                           ; set r_xcount to correct value");
+  }
+  if (!simple_y_scale(wp, ws)) /* If not simple scaling, might not paint the first row */
+    branch(ws, B, L(y_loop_enter),                          "B       y_loop_enter                    ; enter the main loop");
+  /* Top of the y-loop */
+  newline();
+  DEFINE_LABEL(y_loop,                        "Loop around for each row")
+  if (!simple_y_scale(wp, ws))
+  {
+    comment(ws, "At this point the ptr registers have been updated but not saved");
+    IFDEBUG(do_sprintf(a,                                      "STMIA   sp,{%s}", ptrregs);)
+    ins(ws, STMIA(R(sp)) | ptrs_save_mask, a);
+  }
+  if (wp->is_it_jpeg)
+  {
+    comment(ws, "r_inptr is the source y coord for JPEG data: convert to data pointer");
+    comment(ws, "fetchroutine uses r_inptr(=r0), r12. On output r_inptr=source result pointer");
+    MOV(R(lr), OP2R(R(pc)),                                 "MOV     lr,pc                           ; remember return address from fetchroutine");
+    MOV(R(pc), OP2R(R(r_fetchroutine)),                     "MOV     pc,r_fetchroutine               ; get source address");
+    LDR_WP_C(lr, in_x, "returned value is for base of line - add initial offset")
+    if (wp->save_inlog2bpp < 5)
+    {
+      if (wp->save_inlog2bpp == 3)
+        ADD(R(r_inptr),R(r_inptr),OP2R(R(lr)),                "ADD     r_inptr,r_inptr,lr              ; add in_x as byte offset");
+      else
+        ADD(R(r_inptr),R(r_inptr),OP2R(R(lr)) | LSLI(1),      "ADD     r_inptr,r_inptr,lr,LSL#1        ; add in_x as halfword offset");
+      BIC(R(r_inptr),R(r_inptr),IMM(3),                     "BIC     r_inptr,r_inptr,#3              ; r_inptr is a word pointer");
+    }
+    else
+      ADD(R(r_inptr),R(r_inptr),OP2R(R(lr)) | LSLI(2),      "ADD     r_inptr,r_inptr,lr,LSL#2        ; add in_x as word offset");
+  }
+  if (!onebank)
+  {
+    /* the x-loop variables are already set up, with inptr/outptr/maskinptr saved at new values */
+    ADD(R(lr), R(sp), IMM(ptrs_save_size),                  "ADD     lr,sp,#ptrs_save_size");
+    IFDEBUG(do_sprintf(a,                                      "STMDB   sp!,{%s} %t40; push y-loop variables", yregs);)
+    ins(ws, PUSH | y_loop_save_mask, a);
+    IFDEBUG(do_sprintf(a,                                      "LDMIA   lr,{%s} %t40; load x-loop variables", xregs);)
+    ins(ws, LDMIA(R(lr)) | x_loop_save_mask, a); /* Reload the x-loop variables */
+  }
+  newline();
+  /* Generate the inner loop. */
+  loop_x(wp, ws);
+  /* Suitable register 'bank' swapping. */
+  if (onebank)
+  {
+    IFDEBUG(do_sprintf(a,                                      "LDMIA   sp,{%s,%s} %t40; reload x-loop and ptr registers", ptrregs, xregs);)
+    ins(ws, LDMIA(R(sp)) | x_loop_save_mask | ptrs_save_mask, a);
+  }
+  else
+  {
+    IFDEBUG(do_sprintf(a,                                      "LDMIA   sp!,{%s} %t40; pop y-loop variables", yregs);)
+    ins(ws, POP | y_loop_save_mask, a);
+    newline();
+    comment(ws, "Reload pointers to the start of a row");
+    IFDEBUG(do_sprintf(a,                                      "LDMIA   sp,{%s} %t40; reload ptr registers", ptrregs);)
+    ins(ws, LDMIA(R(sp)) | ptrs_save_mask, a);
+  }
+  /* Control of scaling in the y direction */
+  if (simple_y_scale(wp, ws))
+  {
+    comment(ws,                                      "1:1 scaling in y direction - each source row appears once");
+    if (!PLOTMASK)
+    {
+      if (wp->is_it_jpeg)
+        ADD(R(r_inptr), R(r_inptr), IMM(1),                 "ADD     r_inptr,r_inptr,#1               ; inc y coord of input JPEG data");
+      else
+        SUB(R(r_inptr), R(r_inptr), OP2R(R(r_inoffset)),    "SUB     r_inptr,r_inptr,r_inoffset");
+    }
+    SUB_A(r_outptr,wp->save_outoffset)                     /*SUB     r_outptr,r_outptr,#outoffset*/
+    odither_inc(wp, ws, 1); /* advance to next coord */
+    odither_inc(wp, ws, 0); /* ensure X coord phase alternates on alternate lines */
+      SUB(R(r_maskinptr), R(r_maskinptr),
+          OP2R(R(r_maskinoffset)),                          "SUB     r_maskinptr,r_maskinptr,r_maskinoffset");
+    IFDEBUG(do_sprintf(a,                                      "STMIA   sp,{%s} %t40.; Save updated ptr registers", ptrregs);)
+    ins(ws, STMIA(R(sp)) | ptrs_save_mask, a);
+    SUB(R(r_ysize), R(r_ysize), S | IMM(1),                 "SUBS    r_ysize,r_ysize,#1              ; decrement output pixel size");
+    branch(ws, B | GT, L(y_loop),                           "BGT     y_loop");
+  }
+  else
+  {
+    SUB(R(r_ysize), R(r_ysize), S | IMM(1),                 "SUBS    r_ysize,r_ysize,#1");
+    branch(ws, B | LE, L(y_loop_exit),                      "BLE     y_loop_exit");
+    SUB_A(r_outptr,wp->save_outoffset)                     /*SUB     r_outptr,r_outptr,#outoffset*/
+    odither_inc(wp, ws, 1);
+    odither_inc(wp, ws, 0);
+    if (PLOTMASK)
+    {
+      comment(ws,                                    "Advance ECF pointer");
+      LDR_WP(r_pixel, save_ecflimit);                      /*LDR     r_pixel,save_ecflimit*/
+      CMP(R(r_inptr), OP2R(R(r_pixel)),                     "CMP     r_inptr,r_pixel                 ; check for bottom of ECF");
+      ADD(R(r_inptr), R(r_inptr), EQ | IMM(64),             "ADDEQ   r_inptr,r_inptr,#64             ; and if reached, reset to top");
+      SUB(R(r_inptr), R(r_inptr), IMM(8),                   "SUB     r_inptr,r_inptr,#8              ; points to base of current row of ECF");
+    }
+    comment(ws,                                      "Control of scaling in y direction");
+    DEFINE_LABEL(                                    y_loop_enter,  "Initial entry into the loop")
+    SUBS_A(r_ycount, wp->save_ydiv)                        /*SUBS    r_ycount,r_ycount,#ydiv*/
+    branch(ws, B | PL, L(y_loop),                           "BPL     y_loop                          ; if count>=0 then B else next source row");
+    if (!PLOTMASK)
+    {
+      if (wp->is_it_jpeg)
+        ADD(R(r_inptr), R(r_inptr), IMM(1),                 "ADD     r_inptr,r_inptr,#1              ; inc y coord of source JPEG data");
+      else
+        SUB(R(r_inptr), R(r_inptr), OP2R(R(r_inoffset)),    "SUB     r_inptr,r_inptr,r_inoffset      ; next source row");
+    }
+      SUB(R(r_maskinptr), R(r_maskinptr),
+          OP2R(R(r_maskinoffset)),                          "SUB     r_maskinptr,r_maskinptr,r_maskinoffset ; advance input mask pointer");
+    ADD_A(r_ycount, wp->save_ydiv + wp->save_yadd)         /*ADD     r_ycount,r_ycount,#(ydiv+yadd)*/
+    branch(ws, B, L(y_loop_enter),                          "B       y_loop_enter                    ; reenter the main loop");
+    DEFINE_LABEL(y_loop_exit,                  "Exit from y loop")
+  }
+  newline();
+  comment(ws, "Discard workspace, restore registers, and exit");
+  ADD(R(sp), R(sp), IMM(x_loop_save_size+ptrs_save_size),   "ADD     sp,sp,#x_loop_save_size+ptrs_save_size ; discard saved x-loop variables");
+  ins(ws, POP | 0x5fff,                                     "LDMIA   sp!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} ; restore, exit");
+  MOV(R(pc), OP2R(R(lr)),         "MOV   pc, lr");
+*                                                                         *
+*    Bitblit: The main compiler entry points.                             *
+*                                                                         *
+static blitter find_or_compile_code(asm_workspace *wp, workspace *ws, j_decompress_ptr cinfo)
+/* Based on the workspace variables look through existing compiled buffers for an existing match.
+ *
+ * A key word is assembled from the plot parameters held in wp and ws (source and
+ * output depth, GCOL action, mask/table/JPEG/calibration/dither flags). A buffer
+ * matches when its key word, x and y scale factors and output offset all equal the
+ * current ones; in that case the buffer's existing code pointer is returned.
+ * Otherwise the buffer selected by ws->build_buffer is claimed, loop_y() is called
+ * to generate the code, and ws->compile_base is returned.
+ *
+ * wp    - assembler workspace (source of the save_* plot parameters).
+ * ws    - C workspace (holds the code buffers and compiler state).
+ * cinfo - JPEG decompressor state; only read here when wp->is_it_jpeg is set,
+ *         and otherwise just passed on to loop_y().
+ *
+ * NOTE(review): the statement that steps p through the buffers before the match
+ * test is not visible in this excerpt - confirm against the full file.
+ */
+  code_buffer *p;
+  int key_word;
+  key_word = wp->save_inlog2bpp          /* 0..2 */
+               + (ws->out_l2bpp << 3)    /* 3..5 */
+               + (ws->gcol << 6);        /* 6..8 */
+  if (SOURCE_MASK) key_word |= 1<<9;                             /* bit 9: SOURCE_MASK holds */
+  if (SOURCE_BPPMASK) key_word |= 1<<10;                         /* bit 10: SOURCE_BPPMASK holds */
+  if (wp->trns_palette != 0) key_word |= 1<<11;                  /* bit 11: translation palette present */
+  if (wp->ColourTTR != 0) key_word |= 1<<12;                     /* bit 12: translation table present */
+  if (wp->BPP != wp->BPC) key_word |= 1<<13;                     /* bit 13: output BPP differs from BPC */
+  if (wp->save_inlog2bpp != wp->save_inlog2bpc) key_word |= 1<<14; /* bit 14: input log2bpp differs from log2bpc */
+  if (PLOTMASK) key_word |= 1<<15;                               /* bit 15: PLOTMASK holds */
+#ifdef ASMjpeg
+  if (wp->is_it_jpeg) key_word |= 1<<16;                         /* bit 16: source is JPEG */
+  if (wp->is_it_jpeg && cinfo->jpeg_color_space == JCS_GRAYSCALE) key_word |= 1<<17; /* bit 17: greyscale JPEG */
+  if (wp->cal_table)
+  {
+    key_word |= 1<<18;                                           /* bit 18: calibration table present */
+    if (ws->cal_table_simple) key_word |= 1<<19;                 /* bit 19: table flagged as simple */
+    if (wp->cal_table->tablecount == 3) key_word |= 1<<20;       /* bit 20: table count is 3 */
+  }
+#ifdef ASMjpeg
+  if (wp->is_it_jpeg && (wp->dither_truecolour & 1)) key_word |= 1<<21; /* bit 21: dither_truecolour bit 0 */
+  if (wp->is_it_jpeg && (wp->dither_truecolour & 2)) key_word |= 1<<22; /* bit 22: dither_truecolour bit 1 */
+  tracef("Searching for compiled code for key_word=%x, scale=%i:%i,%i:%i outoffset=%x.\n" _
+    key_word _ wp->save_xadd - wp->save_xdiv _ wp->save_xdiv _ wp->save_yadd _ wp->save_ydiv _ wp->save_outoffset);
+  tracef("simple_x_scale=%s x_block_move=%s jpeg=%s calibration table=0x%x\n"
+        _ whether(simple_x_scale(wp, ws))
+        _ whether(x_block_move(wp, ws))
+        _ whether(wp->is_it_jpeg)
+        _ wp->cal_table);
+    if (  p->key_word == key_word /* a buffer is reusable only if the key word and every scale/offset value agree */
+       && p->xadd == wp->save_xadd
+       && p->xdiv == wp->save_xdiv
+       && p->yadd == wp->save_yadd
+       && p->ydiv == wp->save_ydiv
+       && p->outoffset == wp->save_outoffset
+       )
+     {
+       tracef("Found existing compiled code in buffer %x.\n" _ p);
+       return (blitter)p->code;
+     }
+  p = &ws->buffers[ws->build_buffer]; /* no match was returned above: claim the buffer indexed by ws->build_buffer */
+  p->key_word = -1; /* Not set unless we complete the compilation - see below */
+  p->xadd = wp->save_xadd;
+  p->xdiv = wp->save_xdiv;
+  p->yadd = wp->save_yadd;
+  p->ydiv = wp->save_ydiv;
+  p->outoffset = wp->save_outoffset;
+  tracef("Compiler initialised for buffer at %x.\n" _ p);
+  compile_buffer_init(wp, ws);
+  /* Now we actually do the compile */
+  loop_y(wp, ws, cinfo);
+  compile_buffer_done(ws);
+  p->key_word = key_word; /* only now does the buffer carry a real key word, so it can match later searches */
+  /* Just did some dynamic code generation so flush the I cache.
+   * The SWI is passed 1, the start address of the code, and the address of the
+   * last word of the buffer (the end address is inclusive). */
+  _swix(OS_SynchroniseCodeAreas, _IN(0) | _IN(1) | _IN(2), 1,
+        (int)ws->compile_base, (int)ws->compile_base + ((BUFSIZE - 1 /* Inclusive */) * sizeof(int)));
+  return (blitter)ws->compile_base;
+blitter putscaled_compiler(asm_workspace *wp, workspace *ws, workspace *ws_end, int gcol)
+/* Main entrypoint from the assembler.
+ *
+ * Normalises the plot parameters held in the assembler workspace and then asks
+ * find_or_compile_code() for a blitter matching them.
+ *
+ * wp     - assembler workspace; several fields are adjusted in place (ColourTTR,
+ *          trns_palette, save_inlog2bpp/bpc for JPEG, and the scale factors).
+ * ws     - C workspace; gcol, masked, mask1bpp, odither, cal_table_simple and
+ *          out_l2bpp/out_l2bpc are set here.
+ * ws_end - end of the C workspace; must lie above ws.
+ * gcol   - low 3 bits give the GCOL action, bit 3 requests use of the mask.
+ *
+ * Returns the blitter produced by find_or_compile_code().
+ */
+  j_decompress_ptr cinfo = NULL;
+  int              i, j;
+  blitter          result;
+  /* Check that the assembler has an adequate opinion of our workspace needs. */
+  tracef("wp=%x ws=%x ws_end=%x.\n" _ wp _ ws _ ws_end);
+  tracef("Size of assembler workspace: %i.\n" _ ((char*)ws) - ((char*)wp));
+  tracef("Size of C workspace: %i.\n" _ ((char*)ws_end) - ((char*)ws));
+  assert(ws_end > ws, ERROR_FATAL);
+  check_workspace(ws);
+  IFDEBUG(dump_asm_workspace(wp);)
+  ws->gcol = gcol & 7;
+  ws->masked = (gcol & 8) != 0;/* || PLOTMASK;*/
+  ws->mask1bpp = ws->masked & (((wp->save_mode) >> 27) != SpriteType_Old);
+  ws->odither = FALSE; /* Set more carefully later. */
+  tracef("gcol=%i (& 7 = %i)       %t32. GCOL action - 0 for plot, 1..7 for various others.\n" _ gcol _ gcol & 7);
+  tracef("masked=%s                %t32. whether to use mask.\n" _ whether(ws->masked));
+  tracef("1bpp mask=%s             %t32. whether mask is new format.\n" _ whether(ws->mask1bpp));
+#ifdef ASMjpeg
+  if (wp->is_it_jpeg)
+  {
+    sprite_header *s = wp->save_sprite;
+    int  *compress_id_word = (int*)((char*) s + s->image); /* The first word of the sprite data */
+    char *jpeg_data;
+    int   jpeg_data_size, jpeg_ws_size;
+    int   opt, err, xmax;
+    /* The sprite image area holds a 4-word descriptor: -1, data pointer, data size, workspace size. */
+    assert(compress_id_word[0] == -1, ERROR_BAD_JPEG);
+    tracef("This JPEG sprite was constructed by PutJPEGScaled\n");
+    jpeg_data = (char*)compress_id_word[1];
+    jpeg_data_size = compress_id_word[2];
+    jpeg_ws_size = compress_id_word[3];
+    check_jpeg_workspace(wp, jpeg_ws_size);
+    cinfo = wp->jpeg_info_ptr;
+    assert(wp->save_inlog2bpp == 5, ERROR_FATAL);          /* 32bpp source */
+    assert(!SOURCE_MASK, ERROR_FATAL);                     /* no mask */
+    tracef("JPEG, initial source coords are %i,%i.\n" _ wp->in_x _ wp->in_y);
+    if ((wp->save_mode >> 27) == 0)
+    {
+      /* Old-style mode - make sure no translation table present. */
+      wp->ColourTTR = 0;                                   /* >>>> mainly for JPEG on RO3 */
+      wp->trns_palette = 0;                                /* >>>> mainly for JPEG on RO3 */
+    }
+    /* Deduce the decompression options */
+    opt = jpeg_decompressor_opts(cinfo, wp);
+    /* Reverse scaling calculation */
+    /* NOTE(review): divides by (save_xadd - save_xdiv) before the scale factors are validated below - confirm the caller guarantees this is non-zero. */
+    xmax = wp->in_x + 2 + (wp->save_xsize * wp->save_xdiv) / (wp->save_xadd - wp->save_xdiv);
+    if (xmax < 0) xmax = s->width; /* set safe xmax if reverse scale calculation overflowed */
+    /* Initialise the decompressor */
+    err = jpeg_scan_file(cinfo, jpeg_data, jpeg_data_size, wp->in_x, xmax, -1, -1, opt);
+    assert(err == 0, ERROR_BAD_JPEG);
+    /* Check the decompressor agreed with proposed output options */
+    if (cinfo->error_argument1 & (jopt_OUTBPP_8 | jopt_OUTBPP_8YUV | jopt_OUTBPP_8GREY)) /* we asked for it, and we got it - 8bpp output pixels */
+    {
+      tracef("actually doing new shiny 8BPP plotting technique\n");
+      wp->save_inlog2bpp = wp->save_inlog2bpc = 3;
+      wp->ColourTTR = 0;
+    }
+    else
+    {
+      if (cinfo->error_argument1 & jopt_OUTBPP_16) /* we asked for it, and we got it - 16bpp output pixels */
+        wp->save_inlog2bpp = wp->save_inlog2bpc = 4;
+    }
+  }
+#ifdef DEBUG
+  /* Additional mask tracing */
+  if (PLOTMASK)
+  {
+    char *p;
+    int  *ecf = (int*) wp->save_ecflimit;
+    tracef("Sprite data:\n");
+    p = (char*) wp->save_inptr;
+    for (i = 0; i < 16; i++)
+    {
+      tracef("%x" _ p);
+      for (j = 0; j < 16; j++) tracef(" %2x" _ p[j]);
+      newline();
+      p -= wp->save_inoffset; /* convert from byte offset to int offset */
+    }
+    tracef("Mask data:\n");
+    p = (char*) (SOURCE_BPPMASK ? wp->save_maskinptr : (int) wp->save_inptr + wp->save_masko);
+    for (i = 0; i < 16; i++)
+    {
+      tracef("%x" _ p);
+      for (j = 0; j < 16; j++) tracef(" %2x" _ p[j]);
+      newline();
+      p -= wp->save_inoffset;
+    }
+    tracef("ECF pattern:\n");
+    for (i = 0; i <= 8; i++)
+      tracef("%x: %c %x %x\n" _ ecf + 2*i _ (ecf+2*i == (int*)wp->save_ecfptr ? '>' : ' ') _ ecf[2*i] _ ecf[2*i + 1]);
+  }
+  if (wp->cal_table)
+  {
+    calibration_table *t = wp->cal_table;
+    /* "Simple" means black at 0, white at 0xffffff00 and no post-processing SWI. */
+    ws->cal_table_simple = t->idealblack == 0 && t->idealwhite == 0xffffff00 && t->postprocessSWI == 0;
+#ifdef DEBUG
+    tracef("Calibration table at 0x%x: version=%i idealblack=0x%x idealwhite=0x%x postprocessSWI=0x%x tablecount=%i simple=%s.\n"
+      _ t->version _ t->idealblack _ t->idealwhite _ t->postprocessSWI _ t->tablecount _ whether(ws->cal_table_simple));
+    for (i = 0; i < 256; i++) tracef(" %i" _ t->redtable[i]); newline();
+    if (t->tablecount == 3) for (i = 0; i < 256; i++) tracef(" %i" _ t->greentable[i]); newline();
+    if (t->tablecount == 3) for (i = 0; i < 256; i++) tracef(" %i" _ t->bluetable[i]); newline();
+    assert(wp->BPP == 32, ERROR_FATAL);              /* only to 32 bit dest */
+    assert(wp->save_inlog2bpp >= 4, ERROR_FATAL);    /* only from 16 or 32 bit source */
+    assert(!SOURCE_TABLE, ERROR_FATAL);              /* there isn't room for a calibration table and another table - they share r_table */
+    assert(t->version == 0, ERROR_FATAL);            /* check version number of lookup table */
+  }
+  /* Compute l2bpp from BPP of output - all we're given. */
+  i = 0;
+  j = wp->BPP;
+  while (j > 1)
+  {
+    j = j >> 1; i++;
+  }
+  ws->out_l2bpp = i;
+  if (wp->BPP != wp->BPC) i++;
+  ws->out_l2bpc = i;
+  /* If using a palette, ignore any translation table */
+  if (wp->trns_palette != 0) wp->ColourTTR = 0;
+  /* Simplify scale factors - >>> is this useful? Helps spot 1:1 scaling I guess? */
+  /* All four scale factors must be positive; this also guarantees the halving loops below terminate. */
+  assert(wp->save_xadd > 0, ERROR_FATAL);
+  assert(wp->save_xdiv > 0, ERROR_FATAL);
+  assert(wp->save_yadd > 0, ERROR_FATAL);
+  assert(wp->save_ydiv > 0, ERROR_FATAL);
+  /* Halve the x values while every one of them is even. */
+  while ((wp->save_xadd & 1) == 0 &&
+         (wp->save_xdiv & 1) == 0 &&
+         (wp->save_xcount & 1) == 0 &&
+         (wp->save_xmag & 1) == 0)
+  {
+    wp->save_xadd >>= 1; wp->save_xdiv >>= 1;
+    wp->save_xcount >>= 1; wp->save_xmag >>=1;
+  }
+  /* Likewise for the y values. */
+  while ((wp->save_yadd & 1) == 0 &&
+         (wp->save_ydiv & 1) == 0 &&
+         (wp->save_ycount & 1) == 0)
+  {
+    wp->save_yadd >>= 1; wp->save_ydiv >>= 1;
+    wp->save_ycount >>= 1;
+  }
+  /* Look for unit translation table */
+#ifdef DEBUG
+  if (wp->ColourTTR != 0 && wp->BPP == (1<<wp->save_inlog2bpp)) /* only if table, and depth matches */
+  {
+    char *t = (char*) wp->ColourTTR;
+    BOOL  same = TRUE;
+    int   size = 1 << (1 << (wp->save_inlog2bpp == 5 ? 4 : wp->save_inlog2bpp));
+    if (wp->save_xsize * wp->save_ysize > size) /* Unless huge table for tiny sprite */
+    {
+      for (i = 0; i < size; i++)
+        if (t[i] != i) {same = FALSE; break;}
+      if (same)
+      {
+        tracef("Unit translation table - discarded\n");
+        wp->ColourTTR = 0;
+        assert(0, ERROR_FATAL); /* These are now zapped by the assembler, so they shouldn't ever turn up. */
+      }
+    }
+  }
+  /* Precise handling of double-pixel modes by the surrounding code is still unclear to me!
+   * When it enters this code bpc!=bpp can still be the case, but it seems that the actual
+   * value of bpc is best ignored, it has all been frigged into the scale factors. Avoid
+   * this issue for now, but note that we must set the values back afterwards because they
+   * can be reused on the next sprite plot, if the source sprite mode word is the same.
+   */
+  i = wp->BPC;                 /* remember the real values so they can be restored below */
+  j = wp->save_inlog2bpc;
+  wp->BPC = wp->BPP;
+  wp->save_inlog2bpc = wp->save_inlog2bpp;
+  result = find_or_compile_code(wp, ws, cinfo);
+  wp->BPC = i;
+  wp->save_inlog2bpc = j;
+  return result;