/* fmul.c : produce efficient code to do approximate multiplication * of an integer by a constant less than 1. * * (c) 1998 Gareth McCaughan * You may use and distribute copies of this code, modified or * not, provided that (1) this copyright message remains unchanged * and (2) the source code contains a description of any changes * you have made to it. You may not distribute it in compiled form * unless you also distribute the source code. You may include it * in other software; in that case, you must still include the * source for "fmulc" or whatever portions of it are used, but * you need not include source for material you have written. * There is no restriction on your use of the output from fmulc; * you may do anything you like with that. * * NB that this doesn't necessarily produce anything like optimal * code for exact divrem; it's not actually optimising the right * things for that. It should, however, do an adequate job. * * Very quick usage instructions: * * fmulc greedy 2/13 * * produce code to multiply by about 2/13, using the "greedy" * approach. * * fmulc .12345 * * produce code to multiply by about 0.12345, doing 1-ply * search with "greedy completion": i.e., to evaluate a * leaf node in the search we finish the computation off * in greedy mode, and see how well we do. * * fmulc .12345 slow * * ditto, but with "1-ply completion": like greedy * completion except that we finish the computation * at each leaf node in normal mode. * * fmulc expensive under 1/13 * * produce code to multiply by about 1/13, doing 2-ply search * (plus greedy completion) and forcing the result to be an * underestimate. * * fmulc expensive under 1/13 delay * * ditto, but tries to decrease the maximum error by making * the last instruction a shift. * * fmulc expensive .12345 5 * * produce code to multiply by about 0.12345, doing 2-ply * search plus greedy completion, and not producing more * than 5 instructions of output. * * fmulc expensive .12345 5 arm * * ditto, but produce ARM assembler instead of C. * * fmulc .12345 source=x dest=y * * Produce code with variables called x and y. * * fmulc .12345 arm source=x dest=y * * Produce code with registers called x and y. * * The relationship between "slow" and "expensive": * "slow" equals "expensive" plus better static evaluation; * "slow" plus "expensive" is basically doing 3-ply search. * Hence, "slow" is somewhat slower than "expensive", and * the two together are painful. * * It is not guaranteed that "slow" is better than "expensive", * or that either is better than "normal", in the sense of producing * smaller overall error estimates. They are trying to optimise * something different (and cheaper!). My limited experience is * that "slow" and "expensive" are usually better than "normal", * and that the relation between the two varies. When you're * greatly restricting the number of instructions produced, though, * slower usually *does* mean better. * * The "greedy", "expensive", "under", "over", "delay" and "arm" * keywords can appear anywhere on the command line. You must * give a multiplier, which can be either a single decimal * number or two separated by "/". You may also give a maximum * number of instructions, which must come after the multiplier. * There's also a "c" keyword, which undoes "arm". It's only there * for completeness; you don't need it. The "source=x" and "dest=x" * things can also occur anywhere on the line. You can abbreviate * them to "src" and "dst" if you want to. * * If you want to use this for making exact divrem code, you * need to pay attention to the error estimates. The error in * the result is bounded by the final "overall error" estimate; * its sign is uncertain unless you have ordered fmulc to * overestimate or underestimate. * * Change log: * * 1.00 Initial release. * 1.10 Cosmetic cleanups. * Code rearranged and commented better. * User-defined source and destination names. * "Slow" feature. * Timing. * Tweakage. */ #define VERSION "1.10" #include #include #include #include #include /* If the preprocessor symbol is defined then we'll use * the sort routine from gjmlib; otherwise, the system-provided * one. */ #ifdef G extern void qsortG(void*, size_t, size_t, int (*)(const void *, const void *)); #endif /* A decent |qsort()| routine. On most systems you can just use * the |qsort| from the standard C library, but RISC OS's library * has a disastrously broken |qsort|. */ #ifdef G #define Qsort qsortG #else #define Qsort qsort #endif /* The argument type for comparator functions passed to the * above |qsort|. Usually this is |const void *|. */ typedef const void * Qsort_ptr; typedef unsigned long u32; typedef signed long i32; #define DIFF(x,y) ((x)<(y) ? (y)-(x) : (x)-(y)) #define F(x) ((x)*ldexp(1.0,-32)) /* A single operation is represented by a type and a shift amount. * The different shift types are best explained by the |print_shift| * routine. * The type and the amount are combined into a single integer * by the |SHIFT| macro, and extracted by |TYPE| and |NUM|. */ enum { NONE, sr1e, sr1s, sr1p, sr1m, sr1r, srXp, srXm, srXs, srXe }; #define SHIFT(type,num) (((type)<<8)+(num)) #define TYPE(shift) ((shift)>>8) #define NUM(shift) ((shift)&0xFF) /* We can produce output as C code or as ARM assembler. * If |armp!=0| we do ARM, else C. */ static int armp=0; /* |bc(0)| starts a comment at the start of the line. * |bc(1)| starts a comment in the middle of the line. * |cc()| continues a comment on the next line. * |ec()| ends a comment and starts a new line. */ static int c_continued=0; static void bc(int tabp) { if (tabp) putchar('\t'); printf(armp ? ";; " : "/* "); c_continued=0; } static void cc(void) { printf(armp ? "\n;; " : "\n * "); c_continued=1; } static void ec(void) { if (!armp && c_continued) printf("\n"); if (!armp) printf(" */"); printf("\n"); } /* The usual utility function... */ static void * xmalloc(size_t n) { void * p = malloc(n); if (!p) { fprintf(stderr, "! Out of memory.\n"); exit(1); } return p; } /* The names of the source and destination variables * or registers are in |src|, |dst|; they have been * padded to have equal length. |spc| contains only * spaces and has the same length as |src| and |dst|. * They default to |src| and |dst| in C code, and to * |rSrc| and |rDst| in ARM code. */ static const char *src, *dst, *spc; /* Set up |src|, |dst|, |spc| given un-padded values * for |src| and |dst|. */ static void init_names(const char * src0, const char * dst0) { size_t ls=strlen(src0), ld=strlen(dst0); size_t l = (ls>ld) ? ls : ld; char * s = xmalloc(3*(l+1)); memset(s, ' ', 3*(l+1)); src=s; memcpy(s, src0, ls); s[l]=0; dst=s+l+1; memcpy(s+l+1, dst0, ld); s[2*l+1]=0; spc=s+2*(l+1); s[3*l+2]=0; } static void print_shift(int shift) { int n = NUM(shift); if (armp) { printf("\t"); switch(TYPE(shift)) { case NONE: printf(";; PASS\t%s %s %s ", spc, spc, spc); break; case sr1e: printf("MOV\t%s,%s,%s LSR#%2d", dst, src, spc, n); break; case sr1s: printf("SUB\t%s,%s,%s,LSR#%2d", dst, src, src, n); break; case sr1p: printf("ADD\t%s,%s,%s,LSR#%2d", dst, dst, src, n); break; case sr1m: printf("SUB\t%s,%s,%s,LSR#%2d", dst, dst, src, n); break; case sr1r: printf("RSB\t%s,%s,%s,LSR#%2d", dst, dst, src, n); break; case srXp: printf("ADD\t%s,%s,%s,LSR#%2d", dst, dst, dst, n); break; case srXm: printf("SUB\t%s,%s,%s,LSR#%2d", dst, dst, dst, n); break; case srXs: printf("SUB\t%s,%s,%s,LSR#%2d", dst, src, dst, n); break; case srXe: printf("MOV\t%s,%s,%s LSR#%2d", dst, dst, spc, n); break; default: printf(";; ZOG(%08X)", shift); } } else { if (TYPE(shift)!=NONE) printf("%s ", dst); switch(TYPE(shift)) { case NONE: printf("/*PASS*/ "); break; case sr1e: printf(" = %s>>%2d; %s", src, n, spc); break; case sr1s: printf(" = %s-(%s>>%2d);", src, src, n); break; case sr1p: printf("+= %s>>%2d; %s", src, n, spc); break; case sr1m: printf("-= %s>>%2d; %s", src, n, spc); break; case sr1r: printf(" = (%s>>%2d)-%s;", src, n, dst); break; case srXp: printf("+= %s>>%2d; %s", dst, n, spc); break; case srXm: printf("-= %s>>%2d; %s", dst, n, spc); break; case srXs: printf(" = %s-(%s>>%2d);", src, dst, n); break; case srXe: printf(" = %s>>%2d; %s", dst, n, spc); break; default: printf("/* %08X */ ", shift); } } } /* Update an error bound to take account of a single action. * What this is measuring is the maximum total difference * between what we calculate and the actual product of * our initial number and the "effective multiplier". * By "effective multiplier", I mean: what we would be * multiplying by exactly if shift operations didn't * throw away bits that fall off the LS end. * The overall error is the sum of (1) the error calculated * by successive applications of this function, and (2) * the error resulting from the fact that the effective * multiplier doesn't (well, needn't) equal the multiplier * we actually want. * * This is overly pessimistic, especially when we have a * mixture of underestimating and overestimating actions. * We'd get better results from keeping track of the * possible positive and negative errors. Tough. */ static double update_error(int shift, double error) { int n = NUM(shift); switch(TYPE(shift)) { case NONE: return error; case sr1e: return n ? 1-ldexp(1.0,-n) : 0; case sr1s: return n ? 1-ldexp(1.0,-n) : 0; case sr1p: return error + (n ? 1-ldexp(1.0,-n) : 0); case sr1m: return error + (n ? 1-ldexp(1.0,-n) : 0); case sr1r: return error + (n ? 1-ldexp(1.0,-n) : 0); case srXp: return error*(1+ldexp(1,-n)) + (n ? 1-ldexp(1.0,-n) : 0); case srXm: return error*(1-ldexp(1,-n)) + (n ? 1-ldexp(1.0,-n) : 0); case srXs: return ldexp(error,-n) + (n ? 1-ldexp(1.0,-n) : 0); case srXe: return ldexp(error,-n) + (n ? 1-ldexp(1.0,-n) : 0); default: fprintf(stderr,"Uh-oh...\n"); return error; } } /* Update the effective multiplier to take account of a * new action. (See above for the definition of "effective * multiplier".) */ static double update_multiplier(int shift, double mult) { int n = NUM(shift); switch(TYPE(shift)) { case NONE: return mult; case sr1e: return ldexp(1.0,-n); case sr1s: return 1.0-ldexp(1.0,-n); case sr1p: return mult+ldexp(1.0,-n); case sr1m: return mult-ldexp(1.0,-n); case sr1r: return ldexp(1.0,-n)-mult; case srXp: return mult+ldexp(mult,-n); case srXm: return mult-ldexp(mult,-n); case srXs: return 1.0-ldexp(mult,-n); case srXe: return ldexp(mult,-n); default: fprintf(stderr,"Uh-oh...\n"); return mult; } } /* All our searching routines really need to return three * values (at least): the shift operation they chose, the * new value of the destination variable/register, and the * resulting error estimate (i.e., the resulting difference * between actual and desired multipliers; usually, the * resulting difference after running the greedy algorithm * for as many iterations as available). * * Perhaps suboptimally, we return two of these three values * in global variables. */ static int last_action; static u32 last_result; /* For use in |greedy()|: update our best-yet-seen variables. */ #define U(type) { y1=y; delta=h; best=SHIFT(type,i); } /* These macros all update iff (1) this action gives better results * than we've yet seen, and (2) any constraint is satisfied. * |UPDATE_UE| is for guaranteed underestimates; * |UPDATE_OE| for guaranteed overestimates; * |UPDATE_XX| for calculations that might produce either. * |UPDATE_OK| for under/over-estimates that we know need only the static * check. * NB that these all assume we've done the preliminary checks * of |constraint| against what's statically known about the * action: for instance, |UPDATE_XX| need do no constraint check. */ #define U_(x) { u32 h=DIFF(target,y); x } #define UPDATE_UE(type) U_(if (h=target)) U(type)) #define UPDATE_XX(type) U_(if (h0| then only allow overestimates. * If |constraint<0| then only allow underestimates. * If |constraint==0| then allow either. */ static i32 greedy(u32 target, u32 current, int iter, int constraint) { int best=NONE; int i; u32 y1=current, delta=DIFF(target,current); if (target==current) { last_action = NONE; last_result = target; return -(32-iter); } for (i=1; i<32; ++i) { if (iter==0) { if (constraint<=0) { /* t = n>>i. Underestimate. */ u32 y = 1ul << (32-i); UPDATE_UE(sr1e) } if (constraint>=0) { /* t = n-(n>>i). Overestimate. */ u32 y = - (1ul << (32-i)); UPDATE_OE(sr1s) } } else { if (constraint<=0) { /* t = t+(n>>i). Underestimate. */ u32 y = current + (1ul << (32-i)); if (y>current) UPDATE_UE(sr1p) } if (constraint<=0) { /* t = t+(t>>i). Underestimate. */ u32 y = current + (current>>i); if (y>current) UPDATE_UE(srXp) } if (constraint<=0) { /* t = t>>i. Underestimate. */ u32 y = current>>i; UPDATE_OK(srXe) } if (constraint>=0) { /* t = t-(n>>1). Overestimate. */ u32 y = current - (1ul << (32-i)); if (y>i). May be over or under. */ u32 y = current - (current>>i); UPDATE_XX(srXm) } if (constraint==0) { /* t = (n>>i)-t. May be over or under. */ u32 y = (1ul << (32-i)) - current; if (y < (1ul << (32-i))) UPDATE_XX(sr1r) } if (constraint==0) { /* t = n-(t>>i). May be over or under. */ u32 y = - (current>>i); UPDATE_XX(srXs) } } } /* one more: sr1r/srXs with i=0. May be over or under. */ if (constraint==0 && iter) { u32 y = - current; u32 h = DIFF(target,y); if (h=lim|, don't touch |last*|. * If |slowp!=0| then instead of doing greedies do 1-ply searches. */ static i32 greedy_tail(u32 target, u32 current, int iter, int constraint, int lim ) { int la=-1; u32 lr=0; i32 delta; if (target==current) { last_action = NONE; last_result = current; return -(32-iter); } if (iter>=lim) { return DIFF(target,current); } if (slowp) { slowp=0; do { delta = search(target, current, iter, 1, constraint, lim); if (la<0) { la=last_action; lr=last_result; } /* It's very important *not* to have |if (delta<0) break;| * as the next line; that gives utterly wrong results, * because it only works in the |greedy()| case because * |greedy()| only returns something negative when it's * reached the end of the line. */ if (last_action==NONE) break; current = last_result; ++iter; } while (iterdelta - ((Move*)b)->delta); if (d) return d; return ((Move*)a)->action - ((Move*)b)->action; } /* Choose an action by exhaustive search to |depth| ply, * with the static evaluation given by |greedy_tail|. * In other words, the quality of a leaf node is determined * by how close to |target| one can get it in the remaining * iterations by using the greedy algorithm. * Everything gets truncated at |lim| iterations. */ static i32 search(u32 target, u32 current, int iter, int depth, int constraint, int lim ) { Move opts[32*7], *op=opts, *best; int i; i32 alpha; int ba; u32 br; alpha = greedy_tail(target, current, iter, constraint, lim); if (depth==0 || iter==lim-1) return alpha; /* Early termination possible? */ if (alpha <= -(31-iter)) return alpha; ba=last_action; br=last_result; /* Enumerate possible actions. */ #define ADD(type) \ op->action = SHIFT(type, i);\ op->result = y;\ op->delta = DIFF(target,y);\ ++op for (i=1; i<32; ++i) { if (iter==0) { /* t = n>>i */ if (constraint<=0) { u32 y = 1ul << (32-i); if (!constraint || y<=target) { ADD(sr1e); } } /* t = n-(n>>i) */ if (constraint>=0) { u32 y = - (1ul << (32-i)); if (!constraint || y>=target) { ADD(sr1s); } } } else { if (constraint<=0) { /* t += n>>i */ { u32 y = current + (1ul << (32-i)); if (y>current && (!constraint || y<=target)) { ADD(sr1p); } } /* t += t>>i */ { u32 y = current + (current >> i); if (y>current && (!constraint || y<=target)) { ADD(srXp); } } /* t = t>>i */ { u32 y = current>>i; ADD(srXe); } } if (constraint>=0) { /* t -= n>>i */ { u32 y = current - (1ul << (32-i)); if (y=target)) { ADD(sr1m); } } } if (constraint==0) { /* t -= t>>i */ { u32 y = current - (current >> i); if (y>i) - t */ { u32 y = (1ul<<(32-i)) - current; if (y < (1ul << (32-i))) { ADD(sr1r); } } } } } if (constraint==0 && iter) { for (i=0; i<32; ++i) { /* t = n - (t>>i) */ { u32 y = - (current>>i); ADD(srXs); } } } /* Sort into a plausible order. */ Qsort(opts, op-opts, sizeof(Move), comparator); /* Try each. */ ++iter; if (--depth==0) { /* In this case the recursive call will just go straight * to |greedy_tail|, so we do that directly. */ Move * op2; for (best=0, op2=opts; op2!=op; ++op2) { i32 a = greedy_tail(target, op2->result, iter, constraint, lim); if (aresult, iter, depth, constraint, lim); if (aaction; last_result = best->result; } else { last_action = ba; last_result = br; } return alpha; } /* For our overall error estimate, we need to know exactly what * we really multiplied by. So the driver function |do_it| * needs to return two pieces of information: an error estimate * and a multiplier value. The latter goes in this global variable * (ugh). */ static double saved_multiplier; /* Once upon a time we had two top-level driver functions, * |do_greedy| and |do_search|, consisting of almost exactly * the same code. Now we use a single function; to smooth * the way for that, here's a wrapper for |greedy| that takes * the same args as |search|. */ static i32 greedy_x(u32 t, u32 cu, int i, int d, int co, int l) { d=d; l=l; /* pacify the compiler */ return greedy(t,cu,i,co); } /* Apply greedy or search, for as many iterations as we * need to. */ static double do_it(u32 target, u32 current, int depth, int constraint, int lim, int ns, i32 (*func)(u32,u32,int,int,int,int) ) { int iter=0; double error=0; double mult=0; if (ns) --lim; while (iter> ns; print_shift(shift); error = update_error(shift, error); mult = update_multiplier(shift, mult); bc(1); printf("%12.12f, err <= %12.12f", F(x), error); ec(); } error += 0xFFFFFFFF*fabs(mult-real_target); bc(armp); printf("Overall error <= %12.12f", error); ec(); printf("\n"); } int main(int ac, char *av[]) { double target=0; int n_iters=32; int greedyp=0, expensivep=0, delayp=0; int constraint=0; int target_given=0, iters_given=0; const char *src0="src", *dst0="dst"; time_t t0,t1; int timingp=0; i32 tweak=0; while (++av,--ac>0) { if (!strcmp(*av,"greedy")) { greedyp=1; continue; } if (!strcmp(*av,"expensive" )) { expensivep=1; continue; } if (!strcmp(*av,"normal")) { greedyp=0; expensivep=0; continue; } if (!strcmp(*av,"slow")) { slowp=1; continue; } if (!strcmp(*av,"over")) { constraint=1; continue; } if (!strcmp(*av,"under")) { constraint=-1; continue; } if (!strcmp(*av,"delay")) { delayp=1; continue; } if (!strcmp(*av,"arm")) { armp=1; continue; } if (!strcmp(*av,"c")) { armp=0; continue; } if (!strcmp(*av,"timing")) { timingp=1; continue; } if (!strncmp(*av,"src=",4)) { src0=*av+4; continue; } if (!strncmp(*av,"source=",7)) { src0=*av+7; continue; } if (!strncmp(*av,"dst=",4)) { dst0=*av+4; continue; } if (!strncmp(*av,"dest=",5)) { dst0=*av+5; continue; } if (!strncmp(*av,"tweak=",6)) { char * end; tweak=strtol(*av+6,&end,10); if (*end) goto lUsageMessage; continue; } if (target_given) { if (iters_given) { lUsageMessage: fprintf(stderr, "This is fmulc, version " VERSION ", by gjm.\n" "Usage: fmulc []\n" "Options:\n" " greedy always do what naively looks best (quick)\n" " expensive search 2-ply instead of 1-ply (slow)\n" " normal [default] search 1-ply\n" " slow do static eval with 1-ply, not greedy, steps\n" " over produce an overestimate\n" " under produce an underestimate\n" " delay delay final right-shift to the end\n" " arm produce ARM assembler, not C\n" " c [default] produce C, not ARM assembler\n" " timing write timing info to stderr\n" " src= call the source reg/var \n" " dst= call the destination reg/var \n" " tweak= change the target bit pattern by \n" "The target must be between 0 and 1 exclusive;\n" "you can specify it as num/denom if you want.\n" " is somewhat more expensive than ;\n" " plus is painful.\n"); return 1; } n_iters = atoi(*av); continue; } { char * end; if (strchr(*av,'/')) { char * cp = strchr(*av,'/'); double x,y; *cp++=0; x=strtod(*av,&end); if (*end) goto lUsageMessage; y=strtod(cp, &end); if (*end) goto lUsageMessage; target = x/y; } else { target=strtod(*av,&end); if (*end) goto lUsageMessage; } } target_given=1; } if (!target_given || target<=0 || target>=1) goto lUsageMessage; if (greedyp && (slowp || expensivep)) goto lUsageMessage; bc(0); printf("%s := about %.12g*%s, using <= %d %s steps%s:", dst0, target, src0, n_iters, greedyp ? "greedy" : expensivep ? "2-ply" : "1-ply", greedyp ? "" : slowp ? " (1-ply tails)" : " (greedy tails)"); if (tweak) { cc(); printf("(Multiplier to be tweaked by %ld unit%s)", tweak, (tweak==1 || tweak==-1) ? "" : "s"); } if (constraint) { cc(); printf("(Required to produce a %sestimate)", constraint>0 ? "over" : "under"); } if (delayp) { cc(); printf("(Maybe keeping a right-shift until the end)"); } ec(); printf("\n"); init_names(src0, dst0); t0=time(0); { int ns=0; double err; double t0 = target; if (delayp) { while (target<0.5) { ++ns; target=ldexp(target,1); } } { double t1 = target*ldexp(1.0,32); u32 t; if (constraint<0) t=(u32)floor(t1); else if (constraint>0) t=(u32)ceil(t1); else t=(u32)floor(t1+0.5); err = do_it(t+tweak, 0, expensivep ? 2 : 1, constraint, n_iters, ns, greedyp ? greedy_x : search); finish_up(err, t0, ns); } } t1=time(0); if (timingp) { double dt = difftime(t1,t0); fprintf(stderr, "[That took about "); if (dt>=120) { double mins=floor(dt/60); dt -= 60*mins; fprintf(stderr, "%g minutes and ", mins); } fprintf(stderr, "%g second%s.]\n", dt, dt==1.0 ? "" : "s"); } return 0; }