Modern Computer Architecture and Programming in Assembly Language

Transcription

1 Modern Computer Architecture and Programming in Assembly Language Moscow State University Faculty of Computational Mathematics and Cybernetics Spring, 2010/2011

2 Course objectives Thread studying C language Understanding C-programs via assembly language Debugging Memory bugs Linkage bugs Performance tuning Malware code analysis Studying machine-level execution model

3 Toolchain

4 Base textbook Computer Systems: A Programmer's Perspective, 2/E (CS:APP2e) Randal E. Bryant and David R. O'Hallaron, Carnegie Mellon University

5 Course organization Online lectures Online workshops Online labs

6 Agenda I. Introduction. 3 sample programs. 1. Hardware organization. Assembly instruction. Data movement. 2. Arithmetic operations. Status flags. Condition Codes. Jump instructions. 3. IA32 stack. Procedures. Call convention. II. C/Assembly mapping in details. 1. «long long» arithmetic 2. Structure and union. Data alignment. 3. Logical, Shift and Rotate Instructions. Bit fields. 4. Conditional move. 5. Loops: reduction to «if-goto» form. 6. Arrays: multidimensional, multilevel. Code optimization: machine- (in)dependent. 7. Switch: if-else chain, jump table, decision tree. 8. cdecl convention. Omit frame pointer. fastcall convention.

7 Von Neumann architecture

8 Modern hardware organization

9 IA32 registers

10 X86-64 registers

11 void f() { static int cntr = 0; // 1 int x = 2, y = 1, z = 0; // 2 unsigned short w = 282; // 3 signed char q = 13; // 4 ++cntr; // 5 z = -x + q * w *y - w; // 6 section.bss ; Allocation 4 byte cntr resd 1 section.text global f ; Entry point f: push ebp mov ebp, esp sub esp, 16 mov dword [ebp-16], 2 ; (1) mov dword [ebp-12], 1 ; (2) mov dword [ebp-8], 0 ; (3) mov word [ebp-4], 282 ; (4) mov byte [ebp-1], 13 ; (5) add dword [cntr], 1 ; (6) movsx eax, byte [ebp-1] ; (7) movzx edx, word [ebp-4] ; (8) imul eax, edx ; (9) imul eax, dword [ebp-12] ; (10) sub eax, dword [ebp-16] ; (11) sub eax, edx ; (12) mov dword [ebp-8], eax ; (13) leave ret

12 Variable location void f() { static int cntr = 0; // 1 int x = 2, y = 1, z = 0; // 2 unsigned short w = 282; // 3 signed char q = 13; // 4 ++cntr; // 5 z = -x + q * w *y - w; // 6

13 Data retrieval byte [ebp - 12] byte [ebp - 11] byte [ebp - 10] byte [ebp - 9] dword [ebp - 12] Little-endian

14 Memory segmentation int x = 2, y = 1, z = 0; unsigned short w = 282; signed char q = 13; static int cntr = 0; x = 2; y = 1; z = 0; ++cntr; z = -x + q * w *y - w;

15 Data transfer mov dword [ebp-16], 2 ; (1) mov dword [ebp-12], 1 ; (2) mov dword [ebp-8], 0 ; (3) mov word [ebp-4], 282 ; (4) mov byte [ebp-1], 13 ; (5)

16 nasm: program organization %include "io.inc" section.data var dd 0x1234F00D section.bss cntr resd 1 section.text global CMAIN CMAIN: add [cntr], 1 mov eax, [var] ; macro ; static variables ; zero initialized ; static variables ; code ; entry point

17 I/O macro PRINT_UDEC size, data PRINT_DEC size, data PRINT_HEX size, data PRINT_CHAR ch PRINT_STRING data NEWLINE GET_UDEC size, data GET_DEC size, data GET_HEX size, data GET_CHAR data GET_STRING data, maxsz Program entry point CMAIN stdlib functions CEXTERN io.inc

18 EFLAGS layout

19 Unsigned overflow diagram Positive overflow Negative overflow x + y x - y

20 Signed overflow diagram Positive overflow x - y Negative overflow Negative overflow x + y Positive overflow

21 Arithmetic instructions: flags OF SF ZF PF CF ADD M M M M M SUB M M M M M ADC M M M M TM SBB M M M M TM IMUL M M IDIV NEG M M M M M M = modified, T = tested, - = no effect

22 void f() { int a[16]; int i, x = 99, y = 97; // 1 if (x < y) { // 2 a[0] = 0; // 3 for (i = 1; i < 16; ++i) { // 4 a[i] = y / i; // 5 section.text global f f: push ebp mov ebp, esp sub esp, 88 mov DWORD [ebp-8], 99 ; (1) mov DWORD [ebp-4], 97 ; (2) mov eax, DWORD [ebp-8] ; (3) sub eax, DWORD [ebp-4] ; (4) jge L5 ; (5) mov DWORD [ebp-76], 0 ; (6) mov DWORD [ebp-12], 1 ; (7) L3: cmp DWORD [ebp-12], 15 ; (8) jg L5 ; (9) mov ecx, DWORD [ebp-12] ; (10) mov edx, DWORD [ebp-4] ; (11) mov eax, edx ; (12) sar edx, 31 ; (13) idiv ecx ; (14) mov DWORD [ebp-76+ecx*4], eax ; (15) add DWORD [ebp-12], 1 ; (16) jmp L3 ; (17) L5: leave ret

23 Flowchart void f() { int a[16]; int i, x = 99, y = 97; // 1 if (x < y) { // 2 a[0] = 0; // 3 for (i = 1; i < 16; ++i) { // 4 a[i] = y / i; // 5

24 array layout Stack frame layout

25

26 Push onto stack

27 Pop off stack

28 Stack frame

29 int main() { int a = 1, b = 2, c; c = sum(a, b); return 0; int sum(int x, int y) { int t = x + y; return t; %include io.inc section.text global CMAIN CMAIN: mov DWORD [ebp-16],0x1 ; (1) mov DWORD [ebp-12],0x2 ; (2) mov eax,dword [ebp-12] ; (3) mov DWORD [esp+4],eax ; (4) mov eax,dword [ebp-16] ; (5) mov DWORD [esp],eax ; (6) call sum ; (7) mov DWORD [ebp-8],eax ; (8) global sum sum: push ebp ; (9) mov ebp,esp ; (10) sub esp,0x10 ; (11) mov edx,dword [ebp+12] ; (12) mov eax,dword [ebp+8] ; (13) add eax,edx ; (14) mov DWORD [ebp-4],eax ; (15) mov eax,dword [ebp-4] ; (16) mov esp, ebp ; (17) pop ebp ; (18) ret ; (19)

30 64-bit addition long long f1(long long a, long long b) { long long c; c = a + b; return c; ; ; mov eax, DWORD [ebp+16] ; (1) mov edx, DWORD [ebp+20] ; (2) add eax, DWORD [ebp+8] ; (3) adc edx, DWORD [ebp+12] ; (4)

31 64-bit addition

32 64-bit addition: data flow

33 64-bit subtraction long long f3(long long a, long long b) { long long c; c = a - b; return c; ; ; mov eax, DWORD [ebp+8] ; (1) mov edx, DWORD [ebp+12] ; (2) sub eax, DWORD [ebp+16] ; (3) sbb edx, DWORD [ebp+20] ; (4)

34 64-bit subtraction: data flow

35 long long f2(long long a, long long b) { long long c; c = a * b; return c; globаl f2 f2: push ebp mov ebp, esp sub esp, 8 mov DWORD [esp], ebx ; (1) mov ecx, DWORD [ebp+20] ; (2) mov ebx, DWORD [ebp+8] ; (3) mov DWORD [esp+4], esi ; (4) mov eax, DWORD [ebp+12] ; (5) mov esi, DWORD [ebp+16] ; (6) imul ecx, ebx ; (7) imul eax, esi ; (8) add ecx, eax ; (9) mov eax, esi ;(10) mul ebx ;(11) mov ebx, DWORD [esp] ;(12) lea esi, [ecx+edx] ;(13) mov edx, esi ;(14) mov esi, DWORD [esp+4] ;(15) mov esp, ebp pop ebp ret

36 64-bit multiplication

37 64-bit multiplication: data flow

38 Contest #1: expression evaluation 7 word problems Solve 5 problems for grade «excellent» Submit via e-judge: Sample problem «Watch out for overflow»

39 Contest #1: «Watch out for overflow» A water tank is a rectangular parallelepiped and has dimensions AxBxC decimeters. A pipe is connected to the tank. The pipe has a throughput of V liters per minute. Determine the number of minutes the valve on the pipe has to be opened for so that the tank gets filled with as much water as possible but without an overflow. The construction of the pipe and valve allows only the maximum throughput, and the valve can be open only for a whole number of minutes. The standard input contains four space-delimited numbers: A, B, C, and V. All numbers are positive integers and do not exceed 2*10^9. Print to the standard output the number of minutes for which the valve is to be opened. It is guaranteed that the correct answer will never exceed 2*10^9. Do not use conditional control and data transfer instructions. Time limit: 1 second Memory limit: 64 MB

40 Contest #1: e-judge

41 Structure field allocation struct rec { int i; int j; int a[3]; int *p; struct rec *x; x->j = x->i; mov edx, dword [x] ; (1) mov eax, dword [edx] ; (2) mov dword [edx + 4], eax ; (3)

42 Structure field access struct rec { int i; int j; int a[3]; int *p; ; mov edx, dword [i] ; (1) mov eax, dword [x] ; (2) lea eax, [eax + 4 * edx + 8] ; (3) struct rec *x; int i; &(r->a[i]);

43 Structure field access struct rec { int i; int j; int a[3]; int *p; ; mov edx, dword [r] ; (1) mov eax, dword [edx + 4] ; (2) add eax, dword [edx] ; (3) lea eax, [edx + 4 * eax + 8] ; (4) mov dword [edx + 20], eax ; (5) struct rec *r; r->p = &r->a[r->i + r->j];

44 struct vs. union // (1) wrong struct NODE_S { struct NODE_S *left; struct NODE_S *right; double data; ; // (2) not bad union NODE_U { struct { union NODE_U *left; union NODE_U *right; internal; double data; ; // (3) correct typedef enum { N_LEAF, N_INTERNAL nodetype_t; struct NODE_T { nodetype_t type; union NODE_U { struct { struct NODE_T *left; struct NODE_T *right; internal; double data; info; ;

45 union vs. copy unsigned float2bit(float f) { union { float f; unsigned u; temp; temp.f = f; return temp.u; global float2bit float2bit: push ebp mov ebp, esp mov eax, dword [ebp + 8] mov esp, ebp pop ebp ret unsigned copy(unsigned u) { return u;

46 Data Alignment typedef struct { int i; char c; int j; trifield1; // (2) typedef struct { int i; int j; char c; trifield2; // (3)

47 Logical Instructions int pierce_arrow(int a, int b) { int t = ~(a | b); return t; section.text global pierce_arrow pierce_arrow: push ebp mov ebp, esp mov eax, DWORD [ebp+12] ; (1) or eax, DWORD [ebp+8] ; (2) not eax ; (3) pop ebp ret

48 Shift left

49 Shift logical right

50 Shift arithmetic right

51 Shift: integer promotion char upndown(char x) { return (x << 8) >> 8; section.text global upndown upndown: push ebp mov ebp, esp movsx eax, BYTE [ebp+8] sal eax, 8 sar eax, 8 pop ebp ret

52 Rotate instructions

53 unsigned sha256_f1(unsigned x) { unsigned t; t = ((x >> 2) | (x << ((sizeof(x) << 3) - 2))); // (1) t ^= ((x >> 13) | (x << ((sizeof(x) << 3) - 13))); // (2) t ^= ((x >> 22) | (x << ((sizeof(x) << 3) - 22))); // (3) return t; global sha256_f1 sha256_f1: push ebp mov ebp, esp mov edx, DWORD [ebp+8] ; (1) pop ebp ; (2) mov eax, edx ; (3) mov ecx, edx ; (4) ror eax, 13 ; (5) ror ecx, 2 ; (6) xor eax, ecx ; (7) ror edx, 22 ; (8) xor eax, edx ; (9) ret

54 Special arithmetic int arith(int x, int y, int z) { int t1 = x + y; int t2 = z * 48; int t3 = t1 & 0xFFFF; int t4 = t2 * t3; return t4; ; mov eax, dword [ebp + 16] ; (1) lea eax, [eax + 2 * eax] ; (2) sal eax, 4 ; (3) mov edx, dword [ebp + 12] ; (4) add edx, dword [ebp + 8] ; (5) and edx, 0xFFFF ; (6) imul eax, edx ; (7) ;

55 Bit field struct omg { int a : 3; int b : 5; int c : 2; unsigned cntr: 31; int sum : 8; ; void f(struct omg *p) { p->cntr++; // 1 p->b = (p->c << 3) | (p->a); // 2 p->sum = p->a + p->b + p->c; // 3 section.text global f f: ; mov esi, DWORD [ebp+8] ; load mov eax, DWORD [esi+4] ; cntr lea edx, [eax+1] ; cntr++ and eax, 0x80000000 ; mask and edx, 0x7FFFFFFF ; mask or eax, edx ; merge mov DWORD [esi+4], eax ; store ;

56 Bit field struct omg { int a : 3; int b : 5; int c : 2; unsigned cntr: 31; int sum : 8; ; void f(struct omg *p) { p->cntr++; // 1 p->b = (p->c << 3) | (p->a); // 2 p->sum = p->a + p->b + p->c; // 3 section.text global f f: ; movzx ebx, BYTE [esi+1] ; p->c sal ebx, 6 ; << sar bl, 3 ; 3 movzx edx, BYTE [esi] ; mov eax, edx ; and edx, 7 ; sal eax, 5 ; sar al, 5 ; p->a or ebx, eax ; sal ebx, 3 ; or edx, ebx ; mov BYTE [esi], dl ; ;

60 Bit field struct omg { int a : 3; int b : 5; int c : 2; unsigned cntr: 31; int sum : 8; ; void f(struct omg *p) { p->cntr++; // 1 p->b = (p->c << 3) | (p->a); // 2 p->sum = p->a + p->b + p->c; // 3 section.text global f f: ; movzx ebx, BYTE [esi+1] ; sal ebx, 6 ; sar bl, 6 ; p->c movzx edx, BYTE [esi] ; sal edx, 5 ; sar dl, 5 ; p->a movzx ecx, BYTE [esi] ; sar cl, 3 ; p->b add ebx, edx ; add ebx, ecx ; mov BYTE [esi+8], bl ; pop ebx ; pop esi ; pop ebp ; ret ;

61 Jcc Condition Description JE ZF Equal / Zero JNE ~ZF Not Equal / Not Zero JS SF Negative JNS ~SF Non-negative JG ~(SF^OF)&~ZF Greater (signed) JGE ~(SF^OF) Greater or Equal (signed) JL (SF^OF) Less (signed) JLE (SF^OF)|ZF Less or Equal (signed) JA ~CF&~ZF Above (unsigned) JB CF Below (unsigned)

62 int absdiff(int x, int y) { int result; if (x > y) { result = x-y; else { result = y-x; return result; absdiff: push ebp mov ebp, esp mov edx, dword [8 + ebp] ; (1) mov eax, dword [12 + ebp] ; (2) cmp edx, eax ; (3) jle .L6 ; (4) sub edx, eax ; (5) mov eax, edx ; (6) jmp .L7 ; (7) .L6: ; (8) sub eax, edx ; (9) .L7: ; (10) pop ebp ret

63 int goto_ad(int x, int y) { int result; if (x <= y) goto Else; result = x-y; goto Exit; Else: result = y-x; Exit: return result; absdiff: push ebp mov ebp, esp mov edx, dword [8 + ebp] ; (1) mov eax, dword [12 + ebp] ; (2) cmp edx, eax ; (3) jle .L6 ; (4) sub edx, eax ; (5) mov eax, edx ; (6) jmp .L7 ; (7) .L6: ; (8) sub eax, edx ; (9) .L7: ; (10) pop ebp ret

64 val = Test? Then_Expr : Else_Expr; val = x>y? x-y : y-x; nt =!Test; if (nt) goto Else; val = Then_Expr; goto Done; Else: val = Else_Expr; Done:... tmp_val = Then_Expr; result = Else_Expr; t = Test; if (t) result = tmp_val; return result;

65 int absdiff(int x, int y) { int result; if (x > y) { result = x-y; else { result = y-x; return result; x loaded in edi y loaded in esi absdiff: mov edx, edi sub edx, esi ; tmp_val:edx = x-y mov eax, esi sub eax, edi ; result:eax = y-x cmp edi, esi ; Compare x:y cmovg eax, edx ; If >, result:eax = tmp_val:edx ret

66 int pcount_do(unsigned x) { int result = 0; do { result += x & 0x1; x >>= 1; while (x); return result; int pcount_do(unsigned x) { int result = 0; loop: result += x & 0x1; x >>= 1; if (x) goto loop; return result;

67 int pcount_do(unsigned x) { int result = 0; loop: result += x & 0x1; x >>= 1; if (x) goto loop; return result; mov ecx, 0 ; result = 0.L2: ; loop: mov eax, edx and eax, 1 ; t = x & 1 add ecx, eax ; result += t shr edx, 1 ; x >>= 1 jne.l2 ; If!0, goto loop Register allocation: edx x ecx result

68 int pcount_while(unsigned x) { int result = 0; while (x) { result += x & 0x1; x >>= 1; return result; int pcount_do(unsigned x) { int result = 0; if (!x) goto done; loop: result += x & 0x1; x >>= 1; if (x) goto loop; done: return result; int pcount_do(unsigned x) { int result = 0; loop: if (!x) goto done; result += x & 0x1; x >>= 1; goto loop; done: return result;

69 #define WSIZE 8*sizeof(int) int pcount_for(unsigned x) { int i; int result = 0; for (i = 0; i < WSIZE; i++) { unsigned mask = 1 << i; result += (x & mask)!= 0; return result;

70 #define WSIZE 8*sizeof(int) int pcount_for(unsigned x) { int i; int result = 0; for (i = 0; i < WSIZE; i++) { unsigned mask = 1 << i; result += (x & mask)!= 0; return result; int pcount_for_gt(unsigned x) { int i; int result = 0; i = 0; if (!(i < WSIZE)) goto done; loop: { unsigned mask = 1 << i; result += (x & mask)!= 0; i++; if (i < WSIZE) goto loop; done: return result;

71 #define WSIZE 8*sizeof(int) int pcount_for(unsigned x) { int i; int result = 0; for (i = 0; i < WSIZE; i++) { unsigned mask = 1 << i; result += (x & mask)!= 0; return result; int pcount_for_gt(unsigned x) { int i; int result = 0; i = 0; if (!(i < WSIZE)) goto done; loop: { unsigned mask = 1 << i; result += (x & mask)!= 0; i++; if (i < WSIZE) goto loop; done: return result;

72 int fib(int x) { // x >= 1 int i; int predpred = 0; int pred = 1; int res = 1; x--; for (i = 0; i < x; i++) { res = predpred + pred; predpred = pred; pred = res; return res; fib: push ebp mov ebp, esp push ebx mov ecx, dword [ebp + 8] ; x xor edx, edx ; predpred mov ebx, 1 ; pred mov eax, 1 ; res dec ecx jecxz.end.loop: lea eax, [edx + ebx] mov edx, ebx mov ebx, eax loop.loop.end: pop ebx pop ebp ret

74 .end: pop ebx pop ebp ret int fib(int x) { // x >= 1 int i; int predpred = 0; int pred = 1; int res = 1; x--; for (i = 0; i < x; i++) { res = predpred + pred; predpred = pred; pred = res; return res; fib: push ebp mov ebp, esp push ebx mov ecx, dword [ebp + 8] ; x xor edx, edx ; predpred mov ebx, 1 ; pred mov eax, 1 ; res dec ecx jecxz.end.loop: lea eax, [edx + ebx] mov edx, ebx mov ebx, eax loop.loop

76 Integer values Stored and processed in general purpose registers Signed/unsigned values Intel ASM Bytes C byte b 1 [unsigned] char word w 2 [unsigned] short double word d 4 [unsigned] int quad word q 8 [unsigned] long long int Floating-point values Stored and processed in special floating-point registers Intel ASM Bytes C Single d 4 float Double q 8 double

77 Arrays layout in memory T A[L]; Array of elements of type T, array length is L Stored in a contiguous memory block of size L * sizeof(T) bytes char string[12]; x x + 12 int val[5]; x x + 4 x + 8 x + 12 x + 16 x + 20 double a[3]; x x + 8 x + 16 x + 24 char *p[3]; x x + 4 x + 8 x + 12

78 Array element access T A[L]; Array of elements of type T, array length is L The identifier A can be used as a pointer to element 0. Pointer type is T* int val[5]; Reference Type Value val[4] int 3 val int * x val+1 int * x + 4 &val[2] int * x + 8 val[5] int?? *(val+1) int 5 val + i int * x + 4 i x x + 4 x + 8 x + 12 x + 16 x + 20

79 #define ZLEN 5 typedef int zip_dig[zlen]; zip_dig cmu = { 1, 5, 2, 1, 3 ; zip_dig mit = { 0, 2, 1, 3, 9 ; zip_dig ucb = { 9, 4, 7, 2, 0 ; zip_dig cmu; zip_dig mit; zip_dig ucb; Declaration zip_dig cmu is equivalent to int cmu[5] Arrays are laid out in contiguous memory blocks 20 bytes each Generally it is not guaranteed that individual arrays are laid out without gaps between them

80 zip_dig cmu; int get_digit (zip_dig z, int dig) { return z[dig]; The edx register contains starting (base) array address The eax register contains element index ; edx = z ; eax = dig mov eax, dword [edx+4*eax] # z[dig] Element address is edx + 4 * eax

81 void zincr(zip_dig z) { int i; for (i = 0; i < ZLEN; i++) z[i]++; ; edx = z mov eax, 0 ; eax = i.l4: ; loop: add dword [edx + 4 * eax], 1 ; z[i]++ add eax, 1 ; i++ cmp eax, 5 ; i vs. 5 jne.l4 ; if (!=) goto loop

82 void zincr_p(zip_dig z) { int *zend = z+zlen; do { (*z)++; z++; while (z!= zend); void zincr_v(zip_dig z) { void *vz = z; int i = 0; do { (*((int *) (vz+i)))++; i += ISIZE; while (i!= ISIZE*ZLEN);.L8: ; edx = z = vz movl eax, 0 ; i = 0 ; loop: add dword [edx + eax], 1 ; Increment vz+i add eax, 4 ; i += 4 cmp eax, 20 ; i vs. 20 jne.l8 ; if (!=) goto loop

83 #define PCOUNT 4 zip_dig pgh[pcount] = {{1, 5, 2, 0, 6, {1, 5, 2, 1, 3, {1, 5, 2, 1, 7, {1, 5, 2, 2, 1 ; zip_dig pgh[4]; zip_dig pgh[4] is equivalent to int pgh[4][5] Variable pgh: array of 4 elements contiguously stored in memory Each element is an array of 5 int s contiguously stored in memory. Rows are laid out first (Row-Major)

84 Declaration T A[R][C]; 2D array of element of type T R rows, C columns Size of type T is K bytes Array size R * C * K bytes Layout in memory Rows first A[0][0] A[R-1][0] A[0][C-1] A[R-1][C-1] int A[R][C]; A [0] [0] A [0] [C-1] A [1] [0] A [1] [C-1] A [R-1] [0] A [R-1] [C-1] 4*R*C bytes

85 Row access A[i] is an array of C elements Each element of type T requires K bytes Start address of row i A + i * (C * K) int A[R][C]; A[0] A[i] A[R-1] A [0] [0] A [0] [C-1] A [i] [0] A [i] [C-1] A [R-1] [0] A [R-1] [C-1] A A+i*C*4 A+(R-1)*C*4

86 int *get_pgh_zip(int index){ return pgh[index]; #define PCOUNT 4 zip_dig pgh[pcount] = {{1, 5, 2, 0, 6, {1, 5, 2, 1, 3, {1, 5, 2, 1, 7, {1, 5, 2, 2, 1 ; ; eax = index lea eax, [eax + 4 * eax] ; 5 * index lea eax, [pgh + 4 * eax] ; pgh + (20 * index) pgh[index] is an array of 5 int s Starting address is pgh+20*index Address is calculated and returned Address is calculated as pgh + 4*(index+4*index)

87 Array elements A[i][j] is element of type T, requiring K bytes Element address is A + i * (C * K) + j * K = A + (i * C + j)* K int A[R][C]; A[0] A[i] A[R-1] A [0] [0] A [0] [C-1] A [i] [j] A [R-1] [0] A [R-1] [C-1] A A+i*C*4 A+i*C*4+j*4 A+(R-1)*C*4

88 int get_pgh_digit (int index, int dig) { return pgh[index][dig]; mov eax, dword [ebp + 8] ; index lea eax, [eax + 4 * eax] ; 5*index add eax, dword [ebp + 12] ; 5*index+dig mov eax, dword [pgh + 4 * eax] ; offset 4*(5*index+dig) pgh[index][dig] has int type Address: pgh + 20*index + 4*dig = = pgh + 4*(5*index + dig) Address is calculated as pgh + 4*((index+4*index)+dig)

89 zip_dig cmu = { 1, 5, 2, 1, 3 ; zip_dig mit = { 0, 2, 1, 3, 9 ; zip_dig ucb = { 9, 4, 7, 2, 0 ; #define UCOUNT 3 int *univ[ucount] = {mit, cmu, ucb; The univ variable is an array of 3 elements Each element is a 4-byte pointer Each pointer references an array of ints univ cmu mit ucb

90 int get_univ_digit (int index, int dig) { return univ[index][dig]; mov eax, dword [ebp + 8] ; index mov edx, dword [univ + 4 * eax] ; p = univ[index] mov eax, dword [ebp + 12] ; dig mov eax, dword [edx + 4 * eax] ; p[dig] Access to element Mem[Mem[univ+4*index]+4*dig] Two memory reads are required First one obtains pointer to a one-dimensional array Second one fetches required element from the onedimensional array

91 Multiple dimension array int get_pgh_digit (int index, int dig) { return pgh[index][dig]; Multiple level array int get_univ_digit (int index, int dig) { return univ[index][dig]; Similar in C Significant difference in assembly Mem[pgh+20*index+4*dig] Mem[Mem[univ+4*index]+4*dig]

92 N x N matrix Fixed dimensions N is known at compile time Dynamic dimensions require explicit index calculation Traditional way to implement multiple dimension arrays Dynamic dimensions with implicit indexing Supported in fresh gcc versions #define N 16 typedef int fix_matrix[n][n]; /* Get element a[i][j] */ int fix_ele (fix_matrix a, int i, int j){ return a[i][j]; #define IDX(n, i, j) ((i)*(n)+(j)) /* Get element a[i][j] */ int vec_ele (int n, int *a, int i, int j){ return a[idx(n,i,j)]; /* Get element a[i][j] */ int var_ele (int n, int a[n][n], int i, int j){ return a[i][j];

93 16 X 16 matrix Element access Address A + i * (C * K) + j * K C = 16, K = 4 /* Retrieval of element a[i][j] */ int fix_ele(fix_matrix a, int i, int j) { return a[i][j]; mov edx, dword [ebp + 12] ; i sal edx, 6 ; i*64 mov eax, dword [ebp + 16] ; j sal eax, 2 ; j*4 add eax, dword [ebp + 8] ; a + j*4 mov eax, dword [eax + edx] ; *(a + j*4 + i*64)

94 n X n matrix Element access Address A + i * (C * K) + j * K C = n, K = 4 /* Retrieval of element a[i][j] */ int var_ele(int n, int a[n][n], int i, int j) { return a[i][j]; mov eax, dword [ebp + 8] ; n sal eax, 2 ; n*4 mov edx, eax ; n*4 imul edx, dword [ebp + 16] ; i*n*4 mov eax, dword [ebp + 20] ; j sal eax, 2 ; j*4 add eax, dword [ebp + 12] ; a + j*4 mov eax, dword [eax + edx] ; *(a + j*4 + i*n*4)

95 Optimizing array element access a jth column #define N 16 typedef int fix_matrix[n][n]; Calculations Process all elements in column j Optimization Fetch individual elements of the column /* Fetch of array column j */ void fix_column (fix_matrix a, int j, int *dest) { int i; for (i = 0; i < N; i++) dest[i] = a[i][j];

96 Optimizing array element access Optimization Calculate ajp = &a[i][j] Register ecx ebx edx Initial value is a + 4*j Step is 4*N Value ajp dest i /* Fetch of array column j */ void fix_column (fix_matrix a, int j, int *dest) { int i; for (i = 0; i < N; i++) dest[i] = a[i][j];.l8: ; loop: mov eax, dword [ecx] ; get *ajp mov dword [ebx + 4 * edx], eax ; store in dest[i] add edx, 1 ; i++ add ecx, 64 ; ajp += 4*N cmp edx, 16 ; i vs. N jne.l8 ; if!=, goto loop

97 Calculate ajp = &a[i][j] Initial value is a + 4*j Optimizing array element access Step is 4*n /* Fetch of array column j */ void var_column Register Value (int n, int a[n][n], ecx ajp int j, int *dest) edi dest { edx i int i; for (i = 0; i < n; i++) ebx 4*n dest[i] = a[i][j]; esi n.l18: ; loop: mov eax, dword [ecx] ; get *ajp mov dword [edi + 4 * edx], eax ; store in dest[i] add edx, 1 ; i++ add ecx, ebx ; ajp += 4*n cmp esi, edx ; n vs. i jg.l18 ; if (>) goto loop

98 Optimizing array element access Change loop direction Exit loop on zero counter Negative step Initial pointer values change It is sufficient to compare only a single index against 0 /* Fetch of array column j */ void var_column (int n, int a[n][n], int j, int *dest) { int i; for (i = n-1; i >=0; i--) dest[i] = a[i][j];.l18: ; loop: mov eax, dword [ecx] ; get *ajp mov dword [edi + 4 * edx], eax ; store in dest[i] add edx, 1 ; i++ add ecx, ebx ; ajp += 4*n cmp esi, edx ; n vs. i jg.l18 ; if (>) goto loop

99 Optimizing array element access Register Initial value ecx a+4*n*(n-1)+4*j edi dest 4 edx n ebx 4*n esi unused now Machine-dependent optimization /* Fetch of array column j */ void var_column (int n, int a[n][n], int j, int *dest) { int i; dest--; for (i = n; i!= 0; i--) dest[i] = a[i-1][j];.l18: ; loop: mov eax, dword [ecx] ; get *(ajp+ ) mov dword [edi + 4 * edx], eax ; store in dest[i] sub ecx, ebx ; ajp -= 4*n sub edx, 1 ; i-- jnz.l18 ; if (!=) goto loop

100 Contest #2:branches, loops, arrays 5 word problems 2 reverse engineering problems Solve any 5 problems for grade «excellent», but at least one reverse engineering problem. Submit via e-judge: Sample word problem «Local extrema» Sample reverse engineering problem «R2»

101 Contest #2: «Local extrema» Let us define local minimum of an integer sequence to be such an element that is strictly less than both its neighbors. Let us define local maximum of an integer sequence to be such an element that is strictly greater than both its neighbors. The standard input contains a non-negative integer N <= followed by N 32-bit integers comprising the sequence. Print to the standard output first the number m of local minimums in the sequence followed by their indices. Then print the number M of local maximums followed by their indices. Indexing starts at 0. First and last sequence elements cannot be its local extrema. Time limit: 1 second Memory limit: 64 MB

102 Contest #2: «R2» Given the following assembly language program, recover its semantics and express it as a C language program. The input is a 32-bit unsigned integer. Time limit: 1 second Memory limit: 64 MB %include "io.inc" SECTION.text GLOBAL CMAIN CMAIN: GET_UDEC 4, EAX MOV EBX, EAX DEC EBX XOR EAX, EBX ADD EAX, 1 RCR EAX, 1 PRINT_UDEC 4, EAX NEWLINE XOR EAX, EAX RET

103 CDECL Where parameters are placed stack Parameter order «reverse»: from stack «top» to «bottom» Which registers may be used by the function EAX, EDX, ECX Whether the caller or the callee is responsible for cleaning up the stack on return Caller cleans Return values EAX EAX:EDX In memory

104 CDECL Parameters placement Integer Actual value Pointer -> Integer Actual value Array -> Pointer Reference Structure/union Actual value

105 Function main #include <stdio.h> int v; void nullify(int argc, char* argv[]); int main(int argc, char* argv[]) { nullify(argc, argv); return 0; void nullify(int argc, char* argv[]) { CMAIN: lea ecx, [esp+4] and esp, -16 push dword [ecx-4] push ebp mov ebp, esp push ecx sub esp, 20 mov eax, dword [ecx+4] mov dword [esp+4], eax mov eax, dword [ecx] mov dword [esp], eax call nullify mov eax, 0 add esp, 20 pop ecx pop ebp lea esp, [ecx-4] ret nullify: ret

106 Stack alignment

107 STDCALL #include <stdio.h> attribute ((stdcall)) int sum(int x, int y); int main() { int a = 1, b = 2, c; c = sum(a, b); printf("%d\n", c); return 0; sum: push ebp mov ebp, esp sub esp, 16 mov edx, DWORD [ebp+12] mov eax, DWORD [ebp+8] add eax, edx mov DWORD [ebp-4], eax mov eax, DWORD [ebp-4] leave ret 8 attribute ((stdcall)) int sum(int x, int y) { int t = x + y; return t;

108 STDCALL #include <stdio.h> attribute ((stdcall)) int sum(int x, int y); int main() { int a = 1, b = 2, c; c = sum(a, b); printf("%d\n", c); return 0; CMAIN: ; mov eax, DWORD [ebp-12] mov DWORD [esp+4], eax mov eax, DWORD [ebp-16] mov DWORD [esp], eax call sum sub esp, 8 mov DWORD [ebp-8], eax ; attribute ((stdcall)) int sum(int x, int y) { int t = x + y; return t;

109 FASTCALL #include <stdio.h> attribute ((fastcall)) int sum(int x, int y); int main() { int a = 1, b = 2, c; c = sum(a, b); printf("%d\n", c); return 0; attribute ((fastcall)) int sum(int x, int y) { int t = x + y; return t; CMAIN: ; mov edx, DWORD [ebp-12] mov ecx, DWORD [ebp-16] call sum mov DWORD [ebp-8], eax ; sum: lea eax, [ecx + edx] ret

110 Omit frame pointer void f(int x, int y) { int numerator = (x + y) * (x - y); int denominator = x * x + y * y; if (0 == denominator) { denominator = 1; return (100 * numerator) / denominator; Register esi ecx Value y X f: ; setup sub esp, 8 mov DWORD [esp+4], esi mov esi, DWORD [esp+16] mov ecx, DWORD [esp+12] mov DWORD [esp], ebx ; Saved Register address esi [esp + 4] ebx [esp]

111 Omit frame pointer void f(int x, int y) { int numerator = (x + y) * (x - y); int denominator = x * x + y * y; if (0 == denominator) { denominator = 1; return (100 * numerator) / denominator; Register esi ecx Value y X f: ; mov edx, esi imul edx, esi ; edx = y^2 mov eax, ecx imul eax, ecx ; eax = x^2 mov ebx, edx add ebx, eax ; ebx = x^2 + y^2 jne.l2 mov ebx, 1.L2 ;

112 Omit frame pointer void f(int x, int y) { int numerator = (x + y) * (x - y); int denominator = x * x + y * y; if (0 == denominator) { denominator = 1; return (100 * numerator) / denominator; Register esi ecx ebx Value y x x^2 + y^2 f: ; .L2: lea edx, [esi+ecx] sub ecx, esi imul edx, ecx ;

113 Omit frame pointer void f(int x, int y) { int numerator = (x + y) * (x - y); int denominator = x * x + y * y; if (0 == denominator) { denominator = 1; return (100 * numerator) / denominator; f: ; imul edx, edx, 100 mov eax, edx sar edx, 31 idiv ebx ; Register Value esi y ecx x ebx x^2 + y^2 edx (x + y) * (x - y)

114 Omit frame pointer void f(int x, int y) { int numerator = (x + y) * (x - y); int denominator = x * x + y * y; if (0 == denominator) { denominator = 1; return (100 * numerator) / denominator; f: ; ; finish mov esi, DWORD [esp+4] mov ebx, DWORD [esp] add esp, 8 ret

115 Variable-length parameter list An ellipsis (...) is placed at the end of a parameter list. Data type va_list Macro va_start(va_list, last fixed param) va_arg(va_list, cast type) va_end(va_list)

116 Variable-length parameter list #include <stdarg.h> int average(int count,...) { va_list ap; int j; int sum = 0; va_start(ap, count); for (j=0; j<count; j++) sum += va_arg(ap, int); va_end(ap); return sum/count;

117 Contest #3: function call 5 word problems 2 reverse engineering problems Solve any 5 problems for grade «excellent», but at least one reverse engineering problem. Submit via e-judge Sample word problem «GCD of Four» Sample reverse engineering problem «R3»

118 Contest #3: «GCD of Four» The standard input contains four integers each greater than zero and less than or equal to Print to the standard output their greatest common divisor. Time limit: 1 second Memory limit: 64 MB

119 Contest #3: «R3» Given the following assembly language program, recover its semantics and express it as a C language program. The input contains a single integer in bounds 0 to 20, inclusive. Time limit: 1 second Memory limit: 64 MB %include "io.inc" SECTION.text GLOBAL CMAIN CMAIN: GET_UDEC 4, EAX CALL F PRINT_UDEC 4, EAX NEWLINE XOR EAX, EAX RET F: CMP EAX, 0 JNZ.REC MOV EAX, 1 RET.REC: DEC EAX CALL F LEA EAX, [EAX + 2 * EAX] RET

120 Acknowledgement We are grateful to Randal E. Bryant and David R. O'Hallaron for great textbook and other course materials we found on the site: Especially we used samples for the following themes: 1. Loops: reduction to «if-goto» form. 2. Arrays: multidimensional, multilevel. 3. Loops: machine-independent code optimization. 4. Switch: jump table.

121 Final exam 10 problems Grading policy Max 6 points for each problem: 60 points total Grade «excellent» >= 48 points (0.8) Grade «good» >= 36 points (0.6) Grade «poor» >= 24 points (0.4)

122 Sample problem #1 Fill in register AL value in hex and in decimal (signed and unsigned), and values of flags CF, OF, ZF and SF after execution of the following instructions. (a) MOV AL, 137 ADD AL, 200 Answer: AL = (hex), (signed dec), (unsigned dec), CF =, OF =, ZF =, SF =. (b) MOV AL, -35 SUB AL, 216 Answer: AL = (hex), (signed dec), (unsigned dec) CF =, OF =, ZF =, SF =.

123 Sample problem #2 Assuming variable A containing the value 0xCAFE BABE, write out register AX value in hex after execution of the following instructions. MOV AX, WORD [A + 2] ADD AX, 3 ; Answer: AX =

124 Sample problem #3 Let register EAX contain a positive integer x <= Write out two variants, both consisting of a single assembly instruction, that multiply x by 5. The result is to remain in EAX. Two variants are considered distinct if mnemonics of the used instructions are different. Answer 1: Answer 2:

125 Sample problem #4 Write a program in assembly equivalent to the following C code fragment. short *px, *py; *px++ = --*py;

126 Sample problem #5 Write a program in assembly equivalent to the following C code fragment. int x, y; x /= -y;

127 Sample problem #6 Write a C code fragment equivalent to the following assembly fragment. Explain in your own words what the code does. SECTION.text GLOBAL foo foo: MOV ESI, DWORD [a] TEST ESI, ESI JE.1 MOV ECX, DWORD [b] TEST ECX, ECX JE.1 MOV EDX, DWORD [ESI] MOV EAX, EDX SAR EDX, 31 IDIV ECX SUB DWORD [ESI], EDX.1: XOR EAX, EAX RET

128 Sample problem #7 A C function f has the following body. *p = d; return x - c; MOVSX EDX, BYTE [EBP + 12] MOV EAX, DWORD [EBP + 16] MOV DWORD [EAX], EDX MOVSX EAX, WORD [EBP + 8] MOV EDX, DWORD [EBP + 20] SUB EDX, EAX MOV EAX, EDX This body corresponds to the following assembly code. Recover the function f prototype declaration.

129 Sample problem #8 Write a function in assembly that calculates, for given n and k, the number of combinations $C_n^k$: $C_n^k = C_{n-1}^{k-1} + C_{n-1}^{k}$ for all integers $n, k > 0$; $C_n^0 = 1$ for all integers $n \ge 0$; $C_0^k = 0$ for all integers $k > 0$. The function must correspond to the following C declaration and be implemented recursively. unsigned int combinations(unsigned int n, unsigned int k);

130 Sample problem #9 Write an assembly program that prints a sum of all odd elements of the principal diagonal of matrix int A[N][N], where N is a compile-time constant. No matrix input code is required.

131 Sample problem #10 Write a C code fragment equivalent to the following assembly fragment. Explain in your own words what the code does. %include "io.inc" SECTION.text GLOBAL CMAIN CMAIN: GET_DEC 4, ECX MOV EBX, 1 XOR EAX, EAX.L: XOR EAX, EBX XOR EBX, EAX XOR EAX, EBX ADD EBX, EAX LOOP.L PRINT_UDEC 4, EAX NEWLINE XOR EAX, EAX RET