/* dct1d_test.c created mer jan 2 13:55:12 GMT 2002 by Yann Guidon Modified for Codevision by Bruce Land, BRL4@cornell.edu */ #include #include #include #define begin { #define end } #define int2fix(a) (((int)(a))<<8) #define float2fix(a) ((int)((a)*256.0)) #define fix2float(a) ((float)(a)/256.0) #define pi 3.14159 int m1, m2, m3, m4 ; /* 8 sample twiddle factors -- needed for 32 points */ int c1,c3,c5,c7,c9,c11,c13,c15; //16 sample factors -- needed for 32 points int c17,c19,c21,c23,c25,c27,c29,c31; //32 sample factors int a[8]; /* the 8 sample inputs -- needed for 32 points*/ int S[8]; /* the 8 sample outputs -- needed for 32 points */ int a16[16]; /* the 16 sample inputs -- needed for 32 points*/ int S16[16]; /* the 16 sample outputs -- needed for 32 points*/ int a32[32]; /* the 16 sample inputs */ int S32[32]; /* the 16 sample outputs */ //==Fast fixed multiply================================= #pragma warn- int multfix(int a, int b) begin #asm push r20 push r21 LDD R22,Y+2 ;load a LDD R23,Y+3 LD R20,Y ;load b LDD R21,Y+1 muls r23, r21 ; [signed)ah * [signed)bh mov r31, r0 ; mul r22, r20 ; al * bl mov r30, r1 ; ;mov r16, r0 mulsu r23, r20 ; [signed)ah * bl add r30, r0 ; adc r31, r1 ; mulsu r21, r22 ; [signed)bh * al add r30, r0 ; adc r31, r1 ; pop r21 pop r20 #endasm end #pragma warn+ //================================================================== void DCT1D_02(void) begin /* Modified for Codevision by Bruce Land BRL4@cornell.edu DCT1D_02.c version mar jan 1 08:45:57 GMT 2002 whygee@f-cpu.org 1* 8-bin DCT for a "baseline" JPG compressor. originally cut and pasted from : sbcci_DCT2D.pdf "Pipelined Fast 2-D DCT Architecture for JPEG Image Compression" Luciano Volcan Agostini Ivan Saraiva Silva Sergio Bampi */ int b0, b1, b2, b3, b4, b5, b6, b7; int c0, c1, c2, c3, c4, c5, c6; int d0, d1, d3, d4; int e2, e3, e4, e6, e7; int f2, f3, f4, f5, f6, f7; /* Step 1 */ b0 = a[0] + a[7]; b1 = a[1] + a[6]; b2 = a[3] - a[4]; /* corrected */ b3 = a[1] - a[6]; b4 = a[2] + a[5]; b5 = a[3] + a[4]; b6 = a[2] - a[5]; b7 = a[0] - a[7]; /***/ /* Step 2 */ c0 = b0 + b5; c1 = b1 - b4; c2 = b2 + b6; /***/ c3 = b1 + b4; c4 = b0 - b5; c5 = b3 + b7; /***/ c6 = b3 + b6; /***/ /* Step 3 */ d0 = c0 + c3; /***/ d1 = c0 - c3; /***/ d3 = c1 + c4; d4 = c2 - c5; /* Step 4 */ e2 = multfix(m3,c2); /*c2*/ e3 = multfix(m1,c6); /*c6*/ e4 = multfix(m4,c5); /*c5*/ e6 = multfix(m1,d3); e7 = multfix(m2,d4); /* Step 5 */ f2 = c4 + e6; /*c4*/ f3 = c4 - e6; /*c4*/ f4 = e3 + b7; /*b7*/ f5 = b7 - e3; /*b7*/ f6 = e2 + e7; f7 = e4 + e7; /* Step 6 */ S[0] = d0; /*d0*/ S[1] = f4 + f7; S[2] = f2; S[3] = f5 - f6; S[4] = d1; /*d1*/ S[5] = f5 + f6; S[6] = f3; S[7] = f4 - f7; end //========================================================= // From the algorithm to combine 2 8-point DCTs given in // KR Rao and P Yip: // "Discrete Cosine Transform", Academic Press 1990 // pages 60-61 void DCT1D_16(void) begin //segment 16 samples into first 8 //then call DCT1D_02 //and assign outputs to correct frequencies a[0] = a16[0] + a16[15]; a[1] = a16[1] + a16[14]; a[2] = a16[2] + a16[13]; a[3] = a16[3] + a16[12]; a[4] = a16[4] + a16[11]; a[5] = a16[5] + a16[10]; a[6] = a16[6] + a16[9]; a[7] = a16[7] + a16[8]; DCT1D_02(); S16[0] = S[0]; S16[2] = S[1]; S16[4] = S[2]; S16[6] = S[3]; S16[8] = S[4]; S16[10] = S[5]; S16[12] = S[6]; S16[14] = S[7]; //segment 16 samples into second 8 //premult by factors //then call DCT1D_02 //and assign outputs to correct frequencies a[0] = multfix(c1, (a16[0] - a16[15])); a[1] = multfix(c3, (a16[1] - a16[14])); a[2] = multfix(c5, (a16[2] - a16[13])); a[3] = multfix(c7, (a16[3] - a16[12])); a[4] = multfix(c9, (a16[4] - a16[11])); a[5] = multfix(c11, (a16[5] - a16[10])); a[6] = multfix(c13, (a16[6] - a16[9])); a[7] = multfix(c15, (a16[7] - a16[8])); DCT1D_02(); S16[1] = S[0] + S[1]; S16[3] = S[1] + S[2]; S16[5] = S[2] + S[3]; S16[7] = S[3] + S[4]; S16[9] = S[4] + S[5]; S16[11] = S[5] + S[6]; S16[13] = S[6] + S[7]; S16[15] = S[7] ; end //========================================================= // From the algorithm to combine 2 16-point DCTs given in // KR Rao and P Yip: // "Discrete Cosine Transform", Academic Press 1990 // pages 60-61 void DCT1D_32(void) begin //segment 16 samples into first 16 //then call DCT1D_16 //and assign outputs to correct frequencies a16[0] = a32[0] + a32[31]; a16[1] = a32[1] + a32[30]; a16[2] = a32[2] + a32[29]; a16[3] = a32[3] + a32[28]; a16[4] = a32[4] + a32[27]; a16[5] = a32[5] + a32[26]; a16[6] = a32[6] + a32[25]; a16[7] = a32[7] + a32[24]; a16[8] = a32[8] + a32[23]; a16[9] = a32[9] + a32[22]; a16[10] = a32[10] + a32[21]; a16[11] = a32[11] + a32[20]; a16[12] = a32[12] + a32[19]; a16[13] = a32[13] + a32[18]; a16[14] = a32[14] + a32[17]; a16[15] = a32[15] + a32[16]; DCT1D_16(); S32[0] = S16[0]; S32[2] = S16[1]; S32[4] = S16[2]; S32[6] = S16[3]; S32[8] = S16[4]; S32[10] = S16[5]; S32[12] = S16[6]; S32[14] = S16[7]; S32[16] = S16[8]; S32[18] = S16[9]; S32[20] = S16[10]; S32[22] = S16[11]; S32[24] = S16[12]; S32[26] = S16[13]; S32[28] = S16[14]; S32[30] = S16[15]; //segment 16 samples into second 16 //premult by factors //then call DCT1D_16 //and assign outputs to correct frequencies a16[0] = multfix(c1, (a32[0] - a32[31])); a16[1] = multfix(c3, (a32[1] - a32[30])); a16[2] = multfix(c5, (a32[2] - a32[29])); a16[3] = multfix(c7, (a32[3] - a32[28])); a16[4] = multfix(c9, (a32[4] - a32[27])); a16[5] = multfix(c11, (a32[5] - a32[26])); a16[6] = multfix(c13, (a32[6] - a32[25])); a16[7] = multfix(c15, (a32[7] - a32[24])); a16[8] = multfix(c17, (a32[8] - a32[23])); a16[9] = multfix(c19, (a32[9] - a32[22])); a16[10] = multfix(c21, (a32[10] - a32[21])); a16[11] = multfix(c23, (a32[11] - a32[20])); a16[12] = multfix(c25, (a32[12] - a32[19])); a16[13] = multfix(c27, (a32[13] - a32[18])); a16[14] = multfix(c29, (a32[14] - a32[17])); a16[15] = multfix(c31, (a32[15] - a32[16])); DCT1D_16(); S32[1] = S16[0] + S16[1]; S32[3] = S16[1] + S16[2]; S32[5] = S16[2] + S16[3]; S32[7] = S16[3] + S16[4]; S32[9] = S16[4] + S16[5]; S32[11] = S16[5] + S16[6]; S32[13] = S16[6] + S16[7]; S32[15] = S16[7] + S16[8]; S32[17] = S16[8] + S16[9]; S32[19] = S16[9] + S16[10]; S32[21] = S16[10] + S16[11]; S32[23] = S16[11] + S16[12]; S32[25] = S16[12] + S16[13]; S32[27] = S16[13] + S16[14]; S32[29] = S16[14] + S16[15]; S32[31] = S16[15] ; end //========================================================= void main(void) begin int i; //serial setop for debugging using printf, etc. UCSRB = 0x18 ; UBRRL = 103 ; putsf("\r\nStarting...\r\n"); //constants for 8 sample DCT [also used by 16 sample) m1 = float2fix(cos(4*pi/16)); m2 = float2fix(cos(6*pi/16)); m3 = float2fix(cos(2*pi/16) - cos(6*pi/16)); m4 = float2fix(cos(2*pi/16) + cos(6*pi/16)); // constants for 16 bit DCT c1 = float2fix(0.5*cos(1.0*pi/32.0)); c3 = float2fix(0.5*cos(3.0*pi/32.0)); c5 = float2fix(0.5*cos(5.0*pi/32.0)); c7 = float2fix(0.5*cos(7.0*pi/32.0)); c9 = float2fix(0.5*cos(9.0*pi/32.0)); c11 = float2fix(0.5*cos(11.0*pi/32.0)); c13 = float2fix(0.5*cos(13.0*pi/32.0)); c15 = float2fix(0.5*cos(15.0*pi/32.0)); c17 = float2fix(0.5*cos(17.0*pi/32.0)); c19 = float2fix(0.5*cos(19.0*pi/32.0)); c21= float2fix(0.5*cos(21.0*pi/32.0)); c23= float2fix(0.5*cos(23.0*pi/32.0)); c25= float2fix(0.5*cos(25.0*pi/32.0)); c27 = float2fix(0.5*cos(27.0*pi/32.0)); c29 = float2fix(0.5*cos(29.0*pi/32.0)); c31 = float2fix(0.5*cos(31.0*pi/32.0)); //put most of the energy in the 4th basis function for (i=0; i<32; i++) { a32[i] = float2fix(cos(2*pi*2*((float)i+0.5)/32.0)); } //time the DCT TCCR1B = 1 ; TCNT1 = 0; DCT1D_32(); TCCR1B = 0; printf("DCT_32 cycles=%d\n\r",TCNT1) ; //print the spectrum for (i=0; i<32; i++) { printf("%f\n\r",fix2float(S32[i])); } while(1); //spin halt end