Try to setup ALU architecture similar to real HW implementation.
Most algorithms are written for such environment. It will clean and speed up your code. In some cases I use asm for this but if you want to be not CPU dependent you can use this class of mine
ALU source in C++:
//---------------------------------------------------------------------------
//--- ALU32 class 2.01 ------------------------------------------------------
//---------------------------------------------------------------------------
#ifndef _ALU32_h
#define _ALU32_h
//---------------------------------------------------------------------------
//#define _ALU32_no_asm
//---------------------------------------------------------------------------
class ALU32
{
public:
BYTE cy;
ALU32() { cy=0; }
void sar(DWORD &c); // msb -> [msb...lsb] -> cy shift arithmetic right
void shl(DWORD &c); // cy <- [msb...lsb] <- 0 shift left
void shr(DWORD &c); // 0 -> [msb...lsb] -> cy shift right
void rcl(DWORD &c); // cy <- [msb...lsb] <- cy shift through carry left
void rcr(DWORD &c); // cy -> [msb...lsb] -> cy shift through carry lright
void inc(DWORD &c);
void dec(DWORD &c);
void add(DWORD &c,DWORD a,DWORD b);
void sub(DWORD &c,DWORD a,DWORD b);
void adc(DWORD &c,DWORD a,DWORD b);
void sbc(DWORD &c,DWORD a,DWORD b);
void mul(DWORD &ch,DWORD &cl,DWORD a,DWORD b); // (ch,cl) = a*b
void div(DWORD &c,DWORD &d,DWORD ah,DWORD al,DWORD b); // c = a/b d =a%b
};
//---------------------------------------------------------------------------
void ALU32::inc(DWORD &c) { if (c==0xFFFFFFFF) cy=1; else cy=0; c++; }
void ALU32::dec(DWORD &c) { if (c==0x00000000) cy=1; else cy=0; c--; }
//---------------------------------------------------------------------------
void ALU32::sar(DWORD &c)
{
cy=c&1;
c=((c>>1)&0x7FFFFFFF)|(c&0x80000000);
}
//---------------------------------------------------------------------------
void ALU32::shl(DWORD &c)
{
cy=c>>31;
c=(c<<1)&0xFFFFFFFE;
}
//---------------------------------------------------------------------------
void ALU32::shr(DWORD &c)
{
cy=c&1;
c=(c>>1)&0x7FFFFFFF;
}
//---------------------------------------------------------------------------
void ALU32::rcl(DWORD &c)
{
DWORD cy0=cy;
cy=c>>31;
c=((c<<1)&0xFFFFFFFE)|cy0;
}
//---------------------------------------------------------------------------
void ALU32::rcr(DWORD &c)
{
DWORD cy0=cy;
cy=c&1;
c=((c>>1)&0x7FFFFFFF)|(cy0<<31);
}
//---------------------------------------------------------------------------
void ALU32::add(DWORD &c,DWORD a,DWORD b)
{
c=a+b;
cy=DWORD(((a &1)+(b &1) )>> 1);
cy=DWORD(((a>>1)+(b>>1)+cy)>>31);
}
//---------------------------------------------------------------------------
void ALU32::sub(DWORD &c,DWORD a,DWORD b)
{
c=a-b;
if (a<b) cy=1; else cy=0;
}
//---------------------------------------------------------------------------
void ALU32::adc(DWORD &c,DWORD a,DWORD b)
{
c=a+b+cy;
cy=DWORD(((a &1)+(b &1)+cy)>> 1);
cy=DWORD(((a>>1)+(b>>1)+cy)>>31);
}
//---------------------------------------------------------------------------
void ALU32::sbc(DWORD &c,DWORD a,DWORD b)
{
c=a-b-cy;
if (cy) { if (a<=b) cy=1; else cy=0; }
else { if (a< b) cy=1; else cy=0; }
}
//---------------------------------------------------------------------------
void ALU32::mul(DWORD &ch,DWORD &cl,DWORD a,DWORD b)
{
#ifdef _ALU32_no_asm
const int _h=1; // this is MSW,LSW order platform dependent So swap 0,1 if your platform is different
const int _l=0;
union _u
{
DWORD u32;
WORD u16[2];
} u;
DWORD al,ah,bl,bh;
DWORD c0,c1,c2;
// separate 2^16 base digits
u.u32=a; al=u.u16[_l]; ah=u.u16[_h];
u.u32=b; bl=u.u16[_l]; bh=u.u16[_h];
// multiplication (al+ah<<16)*(bl+bh<<16) = al*bl + al*bh<<16 + ah*bl<<16 + ah*bh<<32
c0=(al*bl);
add(c1,al*bh,ah*bl);
c2=(ah*bh)+(cy<<16);
// add subresults
add(c0,c0,(c1<<16)&0xFFFF0000); c1=((c1>>16)&0x0000FFFF)+cy;
add(c1,c1,c2);
// construct result from (c3,c2,c1,c0)
ch=c1;
cl=c0;
#else
DWORD _a,_b,_cl,_ch;
_a=a;
_b=b;
asm {
mov eax,_a
mov ebx,_b
mul ebx // H(edx),L(eax) = eax * ebx
mov _cl,eax
mov _ch,edx
}
cl=_cl;
ch=_ch;
#endif
}
//---------------------------------------------------------------------------
void ALU32::div(DWORD &c,DWORD &d,DWORD ah,DWORD al,DWORD b)
{
#ifdef _ALU32_no_asm
DWORD ch,cl,bh,bl,h,l,mh,ml;
int e;
// edge cases
if (!b ){ c=0xFFFFFFFF; d=0xFFFFFFFF; cy=1; return; }
if (!ah){ c=al/b; d=al%b; cy=0; return; }
// align a,b for binary long division m is the shifted mask of b lsb
for (bl=b,bh=0,mh=0,ml=1;bh<0x80000000;)
{
e=0; if (ah>bh) e=+1; // e = cmp a,b {-1,0,+1}
else if (ah<bh) e=-1;
else if (al>bl) e=+1;
else if (al<bl) e=-1;
if (e<=0) break; // a<=b ?
shl(bl); rcl(bh); // b<<=1
shl(ml); rcl(mh); // m<<=1
}
// binary long division
for (ch=0,cl=0;;)
{
sub(l,al,bl); // a-b
sbc(h,ah,bh);
if (cy) // a<b ?
{
if (ml==1) break;
shr(mh); rcr(ml); // m>>=1
shr(bh); rcr(bl); // b>>=1
continue;
}
al=l; ah=h; // a>=b ?
add(cl,cl,ml); // c+=m
adc(ch,ch,mh);
}
cy=0; c=cl; d=al;
if ((ch)||(ah)) cy=1; // overflow
#else
DWORD _al,_ah,_b,_c,_d;
_al=al;
_ah=ah;
_b=b;
asm {
mov eax,_al
mov edx,_ah
mov ebx,_b
div ebx
mov _c,eax // eax = H(edx),L(eax) / ebx
mov _d,edx // edx = H(edx),L(eax) % ebx
}
c=_c;
d=_d;
#endif
}
//---------------------------------------------------------------------------
#endif
//---------------------------------------------------------------------------
mul
and div
are switchable between fast CPU assembly and slower C++ implementation with the #define _ALU32_no_asm
DWORD
is 32 bit unsigned int
and can be defined like typedef unsigned __int32 DWORD;