modern computer architecture and programming in assembly
Post on 26-Oct-2021
7 Views
Preview:
TRANSCRIPT
Modern Computer Architecture and Programming in Assembly Language
Moscow State University Faculty of Computational Mathematics and Cybernetics
Spring, 2010/2011
Course objectives
• Thorough study of the C language
• Understanding C programs via assembly language
– Debugging
• Memory bugs
• Linkage bugs
– Performance tuning
– Malware code analysis
• Studying machine-level execution model
Toolchain
• Computer Systems: A Programmer's Perspective, 2/E (CS:APP2e)
Randal E. Bryant and David R. O'Hallaron,
Carnegie Mellon University
Base textbook
Course organization
• Online lectures – http://asmcourse.cs.msu.ru/
• Online workshops– http://algcourse.cs.msu.su/teachwiki/
• Online labs– http://earth.ispras.ru
Agenda
I. Introduction. 3 sample programs.
1. Hardware organization. Assembly instructions. Data movement.
2. Arithmetic operations. Status flags. Condition Codes. Jump instructions.
3. IA32 stack. Procedures. Call convention.
II. C/Assembly mapping in detail.
1. «long long» arithmetic
2. Structure and union. Data alignment.
3. Logical, Shift and Rotate Instructions. Bit fields.
4. Conditional move.
5. Loops: reduction to «if-goto» form.
6. Arrays: multidimensional, multilevel. Code optimization: machine-(in)dependent.
7. Switch: if-else chain, jump table, decision tree.
8. cdecl convention. Omit frame pointer. fastcall convention.
Von Neumann architecture
Modern hardware organization
IA32 registers
X86-64 registers
void f() {
static int cntr = 0; // 1
int x = 2, y = 1, z = 0; // 2
unsigned short w = 282; // 3
signed char q = 13; // 4
++cntr; // 5
z = -x + q * w *y - w; // 6
}
; Static storage for `static int cntr` of f(); .bss is zero-initialized,
; which supplies the C initializer `= 0`.
section .bss
; Allocation: one dword (4 bytes)
cntr resd 1
section .text
global f
; void f(void) -- entry point.
; Locals relative to ebp: x = [ebp-16], y = [ebp-12], z = [ebp-8],
;                         w = [ebp-4] (word), q = [ebp-1] (byte).
f:
push ebp ; save the caller's frame pointer
mov ebp, esp ; establish this function's frame
sub esp, 16 ; reserve 16 bytes for the locals
mov dword [ebp-16], 2 ; (1) x = 2
mov dword [ebp-12], 1 ; (2) y = 1
mov dword [ebp-8], 0 ; (3) z = 0
mov word [ebp-4], 282 ; (4) w = 282 (16-bit store)
mov byte [ebp-1], 13 ; (5) q = 13 (8-bit store)
add dword [cntr], 1 ; (6) ++cntr
movsx eax, byte [ebp-1] ; (7) eax = q, sign-extended (signed char)
movzx edx, word [ebp-4] ; (8) edx = w, zero-extended (unsigned short)
imul eax, edx ; (9) eax = q * w
imul eax, dword [ebp-12] ; (10) eax = q * w * y
sub eax, dword [ebp-16] ; (11) eax = q * w * y - x
sub eax, edx ; (12) eax = q * w * y - x - w
mov dword [ebp-8], eax ; (13) z = eax
leave ; undo the frame: restores esp and ebp
ret
Variable location
void f() {static int cntr = 0; // 1int x = 2, y = 1, z = 0; // 2unsigned short w = 282; // 3signed char q = 13; // 4++cntr; // 5z = -x + q * w *y - w; // 6
}
Data retrieval
byte [ebp - 12]
byte [ebp - 11]
byte [ebp - 10]
byte [ebp - 9]
dword [ebp - 12]
Little-endian
Memory segmentation
int x = 2, y = 1, z = 0;unsigned short w = 282;signed char q = 13;
static int cntr = 0;
x = 2;y = 1;z = 0;++cntr;z = -x + q * w *y - w;
Data transfer
mov dword [ebp-16], 2 ; (1)mov dword [ebp-12], 1 ; (2)mov dword [ebp-8], 0 ; (3)mov word [ebp-4], 282 ; (4)mov byte [ebp-1], 13 ; (5)
%include "io.inc"                   ; I/O macros

; Skeleton of a NASM program: one static initialized variable, one
; zero-initialized variable and the entry point that touches both.

section .data                       ; initialized static variables
var     dd 0x1234F00D

section .bss                        ; zero-initialized static variables
cntr    resd 1

section .text                       ; code
global CMAIN
CMAIN:                              ; entry point
    add     dword [cntr], 1         ; size is mandatory: a memory/immediate pair
                                    ; gives NASM no register to infer it from
    mov     eax, [var]              ; size comes from eax
nasm: program organization
io.inc
• I/O macro– PRINT_UDEC size, data
– PRINT_DEC size, data
– PRINT_HEX size, data
– PRINT_CHAR ch
– PRINT_STRING data
– NEWLINE– GET_UDEC size, data
– GET_DEC size, data
– GET_HEX size, data
– GET_CHAR data
– GET_STRING data, maxsz
• Program entry point– CMAIN
• stdlib functions– CEXTERN
EFLAGS layout
Unsigned overflow diagram
Positive overflow
Negative overflow
x - yx + y
Signed overflow diagram
Positive overflow
Negative overflow
x - y
x + y
Positive overflow
Negative overflow
OF SF ZF PF CF
ADD M M M M M
SUB M M M M M
ADC M M M M TM
SBB M M M M TM
IMUL M - - - M
IDIV - - - - -
NEG M M M M M
Arithmetic instructions: flags
M = modified, T = tested, - = undefined
void f() {
int a[16];
int i, x = 99, y = 97; // 1
if (x < y) { // 2
a[0] = 0; // 3
for (i = 1; i < 16; ++i) { // 4
a[i] = y / i; // 5
}
}
}
section .text
global f
; void f(void) -- fills a[0..15] only when x < y.
; Locals relative to ebp: a = [ebp-76] (16 dwords), i = [ebp-12],
;                         x = [ebp-8], y = [ebp-4].
f:
push ebp
mov ebp, esp
sub esp, 88 ; reserve 88 bytes for the locals
mov DWORD [ebp-8], 99 ; (1) x = 99
mov DWORD [ebp-4], 97 ; (2) y = 97
mov eax, DWORD [ebp-8] ; (3) eax = x
sub eax, DWORD [ebp-4] ; (4) eax = x - y; sets the flags tested by (5)
jge L5 ; (5) x >= y: skip the whole if-body
mov DWORD [ebp-76], 0 ; (6) a[0] = 0
mov DWORD [ebp-12], 1 ; (7) i = 1
L3:
cmp DWORD [ebp-12], 15 ; (8) compare i with 15
jg L5 ; (9) i > 15: leave the loop
mov ecx, DWORD [ebp-12] ; (10) ecx = i (divisor and array index)
mov edx, DWORD [ebp-4] ; (11) edx = y
mov eax, edx ; (12) eax = y (low half of the dividend)
sar edx, 31 ; (13) edx = copies of y's sign bit, so edx:eax = y sign-extended
idiv ecx ; (14) eax = y / i
mov DWORD [ebp-76+ecx*4], eax ; (15) a[i] = quotient
add DWORD [ebp-12], 1 ; (16) ++i
jmp L3 ; (17) back to the loop test
L5:
leave
ret
Flowchart
void f() {
int a[16];
int i, x = 99, y = 97; // 1
if (x < y) { // 2
a[0] = 0; // 3
for (i = 1; i < 16; ++i) { // 4
a[i] = y / i; // 5
}
}
}
Stack frame layout
array layout
Push onto stack
Pop off stack
Stack frame
int main() {int a = 1, b = 2, c;c = sum(a, b);return 0;
}
int sum(int x, int y) {int t = x + y;return t;
}
%include "io.inc"

section .text

; int main(): calls sum(a, b) with a = [ebp-16], b = [ebp-12], c = [ebp-8].
; NOTE(review): the frame setup and the return sequence of CMAIN are not part
; of this excerpt; only the call sequence is shown.
global CMAIN
CMAIN:
    mov     DWORD [ebp-16], 0x1     ; (1) a = 1
    mov     DWORD [ebp-12], 0x2     ; (2) b = 2
    mov     eax, DWORD [ebp-12]     ; (3) eax = b
    mov     DWORD [esp+4], eax      ; (4) second argument slot = b
    mov     eax, DWORD [ebp-16]     ; (5) eax = a
    mov     DWORD [esp], eax        ; (6) first argument slot = a
    call    sum                     ; (7)
    mov     DWORD [ebp-8], eax      ; (8) c = value returned in eax

; int sum(int x, int y): x = [ebp+8], y = [ebp+12], t = [ebp-4]; result in eax.
global sum
sum:
    push    ebp                     ; (9)  save the caller's frame pointer
    mov     ebp, esp                ; (10) new frame
    sub     esp, 0x10               ; (11) room for locals
    mov     edx, DWORD [ebp+12]     ; (12) edx = y
    mov     eax, DWORD [ebp+8]      ; (13) eax = x
    add     eax, edx                ; (14) eax = x + y
    mov     DWORD [ebp-4], eax      ; (15) t = x + y
    mov     eax, DWORD [ebp-4]      ; (16) return value = t
    mov     esp, ebp                ; (17) drop locals
    pop     ebp                     ; (18) restore the caller's frame pointer
    ret                             ; (19)
long long f1(long long a, long long b) {long long c;c = a + b;return c;
}
; …
; c = a + b for long long operands: a = [ebp+12]:[ebp+8] (high:low),
; b = [ebp+20]:[ebp+16]; the sum is built in edx:eax.
    mov     eax, DWORD [ebp+16]     ; (1) eax = low dword of b
    mov     edx, DWORD [ebp+20]     ; (2) edx = high dword of b
    add     eax, DWORD [ebp+8]      ; (3) add the low dwords; carry-out lands in CF
    adc     edx, DWORD [ebp+12]     ; (4) add the high dwords plus CF
; …
64-bit addition
64-bit addition
64-bit addition: data flow
long long f3(long long a, long long b) {long long c;c = a - b;return c;
}
; …
; c = a - b for long long operands: a = [ebp+12]:[ebp+8] (high:low),
; b = [ebp+20]:[ebp+16]; the difference is built in edx:eax.
    mov     eax, DWORD [ebp+8]      ; (1) eax = low dword of a
    mov     edx, DWORD [ebp+12]     ; (2) edx = high dword of a
    sub     eax, DWORD [ebp+16]     ; (3) subtract the low dwords; borrow lands in CF
    sbb     edx, DWORD [ebp+20]     ; (4) subtract the high dwords and the borrow
; …
64-bit subtraction
64-bit subtraction: data flow
long long f2(long long a, long long b) {
long long c;c = a * b;return c;
}
; long long f2(long long a, long long b) -- 64-bit product, built in edx:eax.
; Arguments: a = [ebp+12]:[ebp+8] (high:low), b = [ebp+20]:[ebp+16].
; ebx and esi are saved in the 8-byte local area and restored before returning.
global f2
f2:
    push    ebp
    mov     ebp, esp
    sub     esp, 8
    mov     DWORD [esp], ebx        ; (1) save ebx
    mov     ecx, DWORD [ebp+20]     ; (2) ecx = b.high
    mov     ebx, DWORD [ebp+8]      ; (3) ebx = a.low
    mov     DWORD [esp+4], esi      ; (4) save esi
    mov     eax, DWORD [ebp+12]     ; (5) eax = a.high
    mov     esi, DWORD [ebp+16]     ; (6) esi = b.low
    imul    ecx, ebx                ; (7) ecx = b.high * a.low
    imul    eax, esi                ; (8) eax = a.high * b.low
    add     ecx, eax                ; (9) ecx = sum of the two cross products
    mov     eax, esi                ;(10) eax = b.low
    mul     ebx                     ;(11) edx:eax = a.low * b.low (unsigned, full 64 bits)
    mov     ebx, DWORD [esp]        ;(12) restore ebx
    lea     esi, [ecx+edx]          ;(13) esi = cross products + high half of (11)
    mov     edx, esi                ;(14) edx = high dword of the result
    mov     esi, DWORD [esp+4]      ;(15) restore esi
    mov     esp, ebp
    pop     ebp
    ret
64-bit multiplication
64-bit multiplication: data flow
Contest #1: expression evaluation
• 7 word problems
• Solve 5 problems for grade «excellent»
• Submit via e-judge: http://earth.ispras.ru/cgi-bin/new-client?contest_id=150&locale_id=0
• Sample problem – «Watch out for overflow»
Contest #1: «Watch out for overflow»
A water tank is a rectangular parallelepiped and has dimensions A x B x C decimeters. A pipe is connected to the tank. The pipe has a throughput of V liters per minute. Determine the number of minutes the valve on the pipe has to be opened for so that the tank gets filled with as much water as possible but without an overflow. The construction of the pipe and valve allows only the maximum throughput, and the valve can be open only for a whole number of minutes. The standard input contains four space-delimited numbers: A, B, C, and V. All numbers are positive integers and do not exceed 2*10^9. Print to the standard output the number of minutes for which the valve is to be opened. It is guaranteed that the correct answer will never exceed 2*10^9. Do not use conditional control and data transfer instructions.
Time limit: 1 second
Memory limit: 64 MB
Contest #1: e-judge
Structure field allocation
struct rec {int i; int j; int a[3]; int *p;
}…struct rec *x;x->j = x->i;
mov edx, dword [x] ; (1)mov eax, dword [edx] ; (2)mov dword [edx + 4], eax ; (3)
/* Record used to illustrate field offsets: i at 0, j at 4, a at 8, p at 20
   (offsets as used by the accompanying assembly). */
struct rec {
    int i;
    int j;
    int a[3];
    int *p;
};

struct rec *x;
int i;

/* Address of element i of the array field; the pointer is x, matching the
   declaration above and the `mov eax, dword [x]` in the assembly. */
&(x->a[i]);
mov edx, dword [i] ; (1)mov eax, dword [x] ; (2)lea eax, [eax + 4 * edx + 8] ; (3)
Structure field access
struct rec {int i; int j; int a[3]; int *p;
};
struct rec *r;
r->p = &r->a[r->i + r->j];
mov edx, dword [r] ; (1)mov eax, dword [edx + 4] ; (2)add eax, dword [edx] ; (3)lea eax, [edx + 4 * eax + 8] ; (4)mov dword [edx + 20], eax ; (5)
Structure field access
// (1) wrongstruct NODE_S {
struct NODE_S *left; struct NODE_S *right;double data;
};
// (2) not bedunion NODE_U {
struct {union NODE_U *left; union NODE_U *right;
} internal;double data;
};
// (3) correcttypedef enum {
N_LEAF, N_INTERNAL} nodetype_t;
struct NODE_T {nodetype_t type;union NODE_U {
struct {struct NODE_T *left; struct NODE_T *right;
} internal;double data;
} info;};
struct vs. union
unsigned float2bit(float f) {union {
float f;unsigned u;
} temp;temp.f = f;return temp.u;
}
unsigned copy(unsigned u) {return u;
}
global float2bitfloat2bit:
push ebpmov ebp, espmov eax, dword [ebp + 8]mov esp, ebppop ebpret
union vs. copy
Data Alignment
typedef struct {int i;char c;int j;
} trifield1; // (2)
typedef struct {int i;int j;char c;
} trifield2; // (3)
int pierce_arrow(int a, int b) {
int t = ~(a | b);return t;
}
section .textglobal pierce_arrowpierce_arrow:
push ebpmov ebp, espmov eax, DWORD [ebp+12] ; (1)or eax, DWORD [ebp+8] ; (2)not eax ; (3)pop ebpret
Logical Instructions
Shift left
Shift logical right
Shift arithmetic right
char upndown(char x) {return (x << 8) >> 8;
}
section .textglobal upndownupndown:
push ebpmov ebp, espmovsx eax, BYTE [ebp+8]sal eax, 8sar eax, 8pop ebpret
Shift: integer promotion
Rotate instructions
unsigned sha256_f1(unsigned x) {unsigned t;t = ((x >> 2) | (x << ((sizeof(x) << 3) - 2))); // (1)t ^= ((x >> 13) | (x << ((sizeof(x) << 3) - 13))); // (2)t ^= ((x >> 22) | (x << ((sizeof(x) << 3) - 22))); // (3)return t;
}
global sha256_f1sha256_f1:
push ebpmov ebp, espmov edx, DWORD [ebp+8] ; (1)pop ebp ; (2)mov eax, edx ; (3)mov ecx, edx ; (4)ror eax, 13 ; (5)ror ecx, 2 ; (6)xor eax, ecx ; (7)ror edx, 22 ; (8)xor eax, edx ; (9)ret
int arith(int x,int y,int z) {
int t1 = x + y;int t2 = z * 48;int t3 = t1 & 0xFFFF;int t4 = t2 * t3;return t4;
}
; …mov eax, dword [ebp + 16] ; (1)lea eax, [eax + 2 * eax] ; (2)sal eax, 4 ; (3)mov edx, dword [ebp + 12] ; (4)add edx, dword [ebp + 8] ; (5)and edx, 65535 ; (6)imul eax, edx ; (7); …
Special arithmetic
struct omg {int a : 3;int b : 5;int c : 2;unsigned cntr: 31;int sum : 8;
};
void f(struct omg *p) {p->cntr++; // 1p->b = (p->c << 3) | (p->a); // 2p->sum = p->a + p->b + p->c; // 3
}
section .textglobal ff:
; …mov esi, DWORD [ebp+8] ; loadmov eax, DWORD [esi+4] ; cntrlea edx, [eax+1] ; cntr++ and eax, -2147483648 ; maskand edx, 2147483647 ; maskor eax, edx ; mergemov DWORD [esi+4], eax ; store; …
Bit field
struct omg {int a : 3;int b : 5;int c : 2;unsigned cntr: 31;int sum : 8;
};
void f(struct omg *p) {p->cntr++; // 1p->b = (p->c << 3) | (p->a); // 2p->sum = p->a + p->b + p->c; // 3
}
section .textglobal ff:
; …movzx ebx, BYTE [esi+1] ; p->csal ebx, 6 ; <<sar bl, 3 ; 3movzx edx, BYTE [esi] ; mov eax, edx ; and edx, 7 ; sal eax, 5 ; sar al, 5 ; p->aor ebx, eax ; sal ebx, 3 ; or edx, ebx ; mov BYTE [esi], dl ; ; …
Bit field
struct omg {int a : 3;int b : 5;int c : 2;unsigned cntr: 31;int sum : 8;
};
void f(struct omg *p) {p->cntr++; // 1p->b = (p->c << 3) | (p->a); // 2p->sum = p->a + p->b + p->c; // 3
}
section .textglobal ff:
; …movzx ebx, BYTE [esi+1] ; p->csal ebx, 6 ; <<sar bl, 3 ; 3movzx edx, BYTE [esi] ; mov eax, edx ; and edx, 7 ; sal eax, 5 ; sar al, 5 ; p->aor ebx, eax ; sal ebx, 3 ; or edx, ebx ; mov BYTE [esi], dl ; ; …
Bit field
struct omg {int a : 3;int b : 5;int c : 2;unsigned cntr: 31;int sum : 8;
};
void f(struct omg *p) {p->cntr++; // 1p->b = (p->c << 3) | (p->a); // 2p->sum = p->a + p->b + p->c; // 3
}
section .textglobal ff:
; …movzx ebx, BYTE [esi+1] ; p->csal ebx, 6 ; <<sar bl, 3 ; 3movzx edx, BYTE [esi] ; mov eax, edx ; and edx, 7 ; sal eax, 5 ; sar al, 5 ; p->aor ebx, eax ; sal ebx, 3 ; or edx, ebx ; mov BYTE [esi], dl ; ; …
Bit field
struct omg {int a : 3;int b : 5;int c : 2;unsigned cntr: 31;int sum : 8;
};
void f(struct omg *p) {p->cntr++; // 1p->b = (p->c << 3) | (p->a); // 2p->sum = p->a + p->b + p->c; // 3
}
section .textglobal ff:
; …movzx ebx, BYTE [esi+1] ; p->csal ebx, 6 ; <<sar bl, 3 ; 3movzx edx, BYTE [esi] ; mov eax, edx ; and edx, 7 ; sal eax, 5 ; sar al, 5 ; p->aor ebx, eax ; sal ebx, 3 ; or edx, ebx ; mov BYTE [esi], dl ; ; …
Bit field
struct omg {int a : 3;int b : 5;int c : 2;unsigned cntr: 31;int sum : 8;
};
void f(struct omg *p) {p->cntr++; // 1p->b = (p->c << 3) | (p->a); // 2p->sum = p->a + p->b + p->c; // 3
}
section .textglobal ff:
; …movzx ebx, BYTE [esi+1] ; sal ebx, 6 ; sar bl, 6 ; p->cmovzx edx, BYTE [esi] ; sal edx, 5 ; sar dl, 5 ; p->amovzx ecx, BYTE [esi] ;sar cl, 3 ; p->badd ebx, edx ; add ebx, ecx ; mov BYTE [esi+8], bl ; pop ebx ; pop esi ; pop ebp ; ret ;
Bit field
Jcc Condition Description
JE ZF Equal / Zero
JNE ~ZF Not Equal / Not Zero
JS SF Negative
JNS ~SF Non-negative
JG ~(SF^OF)&~ZF Greater (signed)
JGE ~(SF^OF) Greater or Equal (signed)
JL (SF^OF) Less (signed)
JLE (SF^OF)|ZF Less or Equal (signed)
JA ~CF&~ZF Above (unsigned)
JB CF Below (unsigned)
/* Returns the larger of x and y minus the smaller one. */
int absdiff(int x, int y) {
    if (x <= y) {
        return y - x;
    }
    return x - y;
}
absdiff:push ebpmov ebp, espmov edx, dword [8 + ebp] ; (1)mov eax, dword [12 + ebp] ; (2)cmp edx, eax ; (3)jle .L6 ; (4)sub edx, eax ; (5)mov eax, edx ; (6)jmp .L7 ; (7)
.L6: ; (8)sub eax, edx ; (9)
.L7: ; (10)pop ebpret
int goto_ad(int x, int y) {int result;if (x <= y) goto Else;result = x-y;goto Exit;
Else:result = y-x;
Exit:return result;
}
absdiff:push ebpmov ebp, espmov edx, dword [8 + ebp] ; (1)mov eax, dword [12 + ebp] ; (2)cmp edx, eax ; (3)jle .L6 ; (4)sub edx, eax ; (5)mov eax, edx ; (6)jmp .L7 ; (7)
.L6: ; (8)sub eax, edx ; (9)
.L7: ; (10)pop ebpret
val = Test ? Then_Expr : Else_Expr;
val = x>y ? x-y : y-x;
nt = !Test;if (nt) goto Else;val = Then_Expr;goto Done;
Else:val = Else_Expr;
Done:. . .
tmp_val = Then_Expr;result = Else_Expr;t = Test;if (t) result = tmp_val;return result;
int absdiff(int x, int y) {int result;if (x > y) {
result = x-y;} else {
result = y-x;}return result;
}
absdiff:mov edx, edisub edx, esi ; tmp_val:edx = x-y mov eax, esisub eax, edi ; result:eax = y-xcmp edi, esi ; Compare x:ycmovg eax, edx ; If >, result:eax = tmp_val:edxret
x loaded in edi
y loaded in esi
int pcount_do(unsigned x) {int result = 0;do {result += x & 0x1;x >>= 1;
} while (x);return result;
}
int pcount_do(unsigned x){int result = 0;
loop:result += x & 0x1;x >>= 1;if (x)goto loop;
return result;}
int pcount_do(unsigned x){int result = 0;
loop:result += x & 0x1;x >>= 1;if (x)goto loop;
return result;}
mov ecx, 0 ; result = 0.L2: ; loop:
mov eax, edxand eax, 1 ; t = x & 1add ecx, eax ; result += tshr edx, 1 ; x >>= 1jne .L2 ; If !0, goto loop
• Register allocation:edx x
ecx result
int pcount_while(unsigned x) {int result = 0;while (x) {result += x & 0x1;x >>= 1;
}return result;
}
int pcount_do(unsigned x) {int result = 0;if (!x) goto done;
loop:result += x & 0x1;x >>= 1;if (x)goto loop;
done:return result;
}
int pcount_do(unsigned x) {int result = 0;
loop:if (!x) goto done;result += x & 0x1;x >>= 1;goto loop;
done:return result;
}
/* Bit width of int (8 bits per byte assumed); parenthesized so the macro
   stays correct inside larger expressions. */
#define WSIZE (8 * sizeof(int))

/* Counts the set bits of x by testing every bit position in turn. */
int pcount_for(unsigned x) {
    int i;
    int result = 0;
    for (i = 0; i < (int)WSIZE; i++) {
        /* 1u keeps the shift unsigned: 1 << 31 overflows a signed int. */
        unsigned mask = 1u << i;
        result += (x & mask) != 0;
    }
    return result;
}
#define WSIZE 8*sizeof(int)
int pcount_for(unsigned x) {int i;int result = 0;for (i = 0; i < WSIZE; i++) {unsigned mask = 1 << i;result += (x & mask) != 0;
}return result;
}
/* pcount_for rewritten in "if-goto" form: an initial guard test followed by
   a bottom-tested loop. Counts the set bits of x. */
int pcount_for_gt(unsigned x) {
    int i;
    int result = 0;
    i = 0;
    if (!(i < WSIZE))
        goto done;
loop:
    {
        /* 1u keeps the shift unsigned: 1 << 31 overflows a signed int. */
        unsigned mask = 1u << i;
        result += (x & mask) != 0;
    }
    i++;
    if (i < WSIZE)
        goto loop;
done:
    return result;
}
#define WSIZE 8*sizeof(int)
int pcount_for(unsigned x) {int i;int result = 0;for (i = 0; i < WSIZE; i++) {unsigned mask = 1 << i;result += (x & mask) != 0;
}return result;
}
int pcount_for_gt(unsigned x) {int i;int result = 0;i = 0;if (!(i < WSIZE))goto done;
loop:{unsigned mask = 1 << i;result += (x & mask) != 0;
}i++;if (i < WSIZE)goto loop;
done:return result;
}
/* Returns the x-th Fibonacci number (fib(1) = fib(2) = 1) by advancing the
   pair (prev2, prev1) x - 1 times. */
int fib(int x) { // x >= 1
    int prev2 = 0;
    int prev1 = 1;
    int res = 1;
    int remaining;
    for (remaining = x - 1; remaining > 0; --remaining) {
        res = prev2 + prev1;
        prev2 = prev1;
        prev1 = res;
    }
    return res;
}
fib:push ebpmov ebp, esppush ebx
mov ecx, dword [ebp + 8] ; xxor edx, edx ; predpredmov ebx, 1 ; predmov eax, 1 ; resdec ecx
jecxz .end.loop:
lea eax, [edx + ebx]mov edx, ebxmov ebx, eaxloop .loop
.end:pop ebxpop ebpret
int fib(int x) { // x >= 1int i;int predpred = 0;int pred = 1;int res = 1;x--;for (i = 0; i < x; i++) {
res = predpred + pred;predpred = pred;pred = res;
}return res;
}
fib:push ebpmov ebp, esppush ebx
mov ecx, dword [ebp + 8] ; xxor edx, edx ; predpredmov ebx, 1 ; predmov eax, 1 ; resdec ecx
jecxz .end.loop:
lea eax, [edx + ebx]mov edx, ebxmov ebx, eaxloop .loop
.end:pop ebxpop ebpret
int fib(int x) { // x >= 1int i;int predpred = 0;int pred = 1;int res = 1;x--;for (i = 0; i < x; i++) {
res = predpred + pred;predpred = pred;pred = res;
}return res;
}
fib:push ebpmov ebp, esppush ebx
mov ecx, dword [ebp + 8] ; xxor edx, edx ; predpredmov ebx, 1 ; predmov eax, 1 ; resdec ecx
jecxz .end.loop:
lea eax, [edx + ebx]mov edx, ebxmov ebx, eaxloop .loop
.end:pop ebxpop ebpret
int fib(int x) { // x >= 1int i;int predpred = 0;int pred = 1;int res = 1;x--;for (i = 0; i < x; i++) {
res = predpred + pred;predpred = pred;pred = res;
}return res;
}
fib:push ebpmov ebp, esppush ebx
mov ecx, dword [ebp + 8] ; xxor edx, edx ; predpredmov ebx, 1 ; predmov eax, 1 ; resdec ecx
jecxz .end.loop:
lea eax, [edx + ebx]mov edx, ebxmov ebx, eaxloop .loop
.end:pop ebxpop ebpret
•Integer values– Stored and processed in general purpose registers
– Signed/unsigned values
Intel ASM Bytes C
byte b 1 [unsigned] char
word w 2 [unsigned] short
double word d 4 [unsigned] int
quad word q 8 [unsigned] long long int
•Floating-point values– Stored and processed in special floating-point registers
Intel ASM Bytes C
Single d 4 float
Double q 8 double
• Arrays — layout in memoryT A[L];
– Array of elements of type T, array length is L
– Stored in a contiguous memory block of size L * sizeof(T) bytes
char string[12];
x x + 12
int val[5];
x x + 4 x + 8 x + 12 x + 16 x + 20
double a[3];
x + 24x x + 8 x + 16
char *p[3];
x x + 4 x + 8 x + 12
•Array element accessT A[L];
– Array of elements of type T, array length is L
– The identifier A can be used as a pointer to element 0. Pointer type is T*
• Reference Type Value
val[4] int 3
val int * x
val+1 int * x + 4
&val[2] int * x + 8
val[5] int ??
*(val+1) int 5
val + i int * x + 4 i
int val[5]; 1 5 2 1 3
x x + 4 x + 8 x + 12 x + 16 x + 20
• Declaration "zip_dig cmu" is equivalent to "int cmu[5]"
• Arrays are laid out in contiguous memory blocks 20 bytes each
– Generally it is not guaranteed that individual arrays are laid out without gaps between them
#define ZLEN 5
typedef int zip_dig[ZLEN];
zip_dig cmu = { 1, 5, 2, 1, 3 };
zip_dig mit = { 0, 2, 1, 3, 9 };
zip_dig ucb = { 9, 4, 7, 2, 0 };
zip_dig cmu; 1 5 2 1 3
16 20 24 28 32 36
zip_dig mit; 0 2 1 3 9
36 40 44 48 52 56
zip_dig ucb; 9 4 7 2 0
56 60 64 68 72 76
The edx register
contains starting (base) array address
The eax register
contains element index
Element address isedx + 4 * eax
int get_digit (zip_dig z, int dig) {
return z[dig];
}
; Body of get_digit(z, dig): loads the 4-byte element z[dig].
; edx = z
; eax = dig
mov eax, dword [edx+4*eax] ; z[dig] (NASM comments start with ';', not '#')
zip_dig cmu; 1 5 2 1 3
16 20 24 28 32 36
void zincr(zip_dig z) {int i;for (i = 0; i < ZLEN; i++)z[i]++;
}
; Loop of zincr(z): increments each of the 5 ints of z in place.
; edx = z
    mov     eax, 0                      ; eax = i
.L4:                                    ; loop:
    add     dword [edx + 4 * eax], 1    ; z[i]++
    add     eax, 1                      ; i++
    cmp     eax, 5                      ; i vs. 5
    jne     .L4                         ; if (!=) goto loop
void zincr_p(zip_dig z) {int *zend = z+ZLEN;do {(*z)++;z++;
} while (z != zend); }
void zincr_v(zip_dig z) {void *vz = z;int i = 0;do {(*((int *) (vz+i)))++;i += ISIZE;
} while (i != ISIZE*ZLEN);}
; Loop of zincr_v(z): i is a byte offset that advances by 4 up to 20.
; edx = z = vz
    mov     eax, 0                  ; i = 0 (NASM mnemonic is mov; movl is AT&T syntax)
.L8:                                ; loop:
    add     dword [edx + eax], 1    ; increment the int at vz + i
    add     eax, 4                  ; i += 4
    cmp     eax, 20                 ; i vs. 20
    jne     .L8                     ; if (!=) goto loop
• "zip_dig pgh[4]" is equivalent to "int pgh[4][5]"
– Variable pgh: array of 4 elements contiguously stored in
memory
– Each element is an array of 5 int’s contiguously stored in
memory.
• Rows are laid out first (Row-Major)
#define PCOUNT 4
zip_dig pgh[PCOUNT] =
{{1, 5, 2, 0, 6},
{1, 5, 2, 1, 3 },
{1, 5, 2, 1, 7 },
{1, 5, 2, 2, 1 }};
zip_dig
pgh[4];
76 96 116 136 156
1 5 2 0 6 1 5 2 1 3 1 5 2 1 7 1 5 2 2 1
• Declaration
T A[R][C];
– 2D array of element of type T
– R rows, C columns
– Size of type T is K bytes
• Array size
– R * C * K bytes
• Layout in memory
– Rows first
A[0][0] A[0][C-1]
A[R-1][0]
• • •
• • • A[R-1][C-1]
•
•
•
•
•
•
int A[R][C];
• • •
A
[0]
[0]
A
[0]
[C-1]
• • •
A
[1]
[0]
A
[1]
[C-1]
• • •
A
[R-1]
[0]
A
[R-1]
[C-1]
• • •
4*R*C bytes
• • •
• Row access
– A[i] is an array of C elements
– Each element of type T requires K bytes
– Start address of row iA + i * (C * K)
• • •
A
[i]
[0]
A
[i]
[C-1]
A[i]
• • •
A
[R-1]
[0]
A
[R-1]
[C-1]
A[R-1]
• • •
A
• • •
A
[0]
[0]
A
[0]
[C-1]
A[0]
A+i*C*4 A+(R-1)*C*4
int A[R][C];
– pgh[index] is an array of 5 int’s
– Starting address is pgh+20*index
– Address is calculated and returned
– Address is calculated as pgh + 4*(index+4*index)
int *get_pgh_zip(int index){
return pgh[index];
}
; eax = index
lea eax, [eax + 4 * eax] ; 5 * index
lea eax, [pgh + 4 * eax] ; pgh + (20 * index)
#define PCOUNT 4
zip_dig pgh[PCOUNT] =
{{1, 5, 2, 0, 6},
{1, 5, 2, 1, 3 },
{1, 5, 2, 1, 7 },
{1, 5, 2, 2, 1 }};
• • •
• Array elements
– A[i][j] is element of type T, requiring K bytes
– Element address isA + i * (C * K) + j * K = A + (i * C + j)* K
• • • • • •
A
[i]
[j]
A[i]
• • •
A
[R-1]
[0]
A
[R-1]
[C-1]
A[R-1]
• • •
A
• • •
A
[0]
[0]
A
[0]
[C-1]
A[0]
A+i*C*4 A+(R-1)*C*4
int A[R][C];
A+i*C*4+j*4
– pgh[index][dig] has int type
– Address: pgh + 20*index + 4*dig =
= pgh + 4*(5*index + dig)
– Address is calculated aspgh + 4*((index+4*index)+dig)
int get_pgh_digit (int index, int dig) {
return pgh[index][dig];
}
mov eax, dword [ebp + 8] ; index
lea eax, [eax + 4 * eax] ; 5*index
add eax, dword [ebp + 12] ; 5*index+dig
mov eax, dword [pgh + 4 * eax] ; offset 4*(5*index+dig)
• The univ variable is an
array of 3 elements
• Each element is a 4-byte pointer
• Each pointer references an array of ints
zip_dig cmu = { 1, 5, 2, 1, 3 };
zip_dig mit = { 0, 2, 1, 3, 9 };
zip_dig ucb = { 9, 4, 7, 2, 0 };
#define UCOUNT 3
int *univ[UCOUNT] = {mit, cmu, ucb};
36160
16
56
164
168
univ
cmu
mit
ucb
1 5 2 1 3
16 20 24 28 32 36
0 2 1 3 9
36 40 44 48 52 56
9 4 7 2 0
56 60 64 68 72 76
– Access to element Mem[Mem[univ+4*index]+4*dig]
– Two memory reads are required
• First one obtains pointer to a one-dimensional array
• Second one fetches required element from the one-dimensional array
mov eax, dword [ebp + 8] ; index
mov edx, dword [univ + 4 * eax] ; p = univ[index]
mov eax, dword [ebp + 12] ; dig
mov eax, dword [edx + 4 * eax] ; p[dig]
int get_univ_digit (int index, int dig) {
return univ[index][dig];
}
int get_pgh_digit
(int index, int dig)
{
return pgh[index][dig];
}
int get_univ_digit
(int index, int dig)
{
return univ[index][dig];
}
Multiple dimension array Multiple level array
•Similar in C•Significant difference in assembly
Mem[pgh+20*index+4*dig] Mem[Mem[univ+4*index]+4*dig]
N x N matrix
• Fixed dimensions
– N is known at compile time
• Dynamic dimensions require explicit index calculation
– Traditional way to implement multiple dimension arrays
• Dynamic dimensions with implicit indexing
– Supported in fresh gcc versions
#define N 16
typedef int fix_matrix[N][N];
/* Get element a[i][j] */
int fix_ele
(fix_matrix a, int i, int j){
return a[i][j];
}
#define IDX(n, i, j) ((i)*(n)+(j))
/* Get element a[i][j] */
int vec_ele
(int n, int *a, int i, int j){
return a[IDX(n,i,j)];
}
/* Get element a[i][j] */
int var_ele
(int n, int a[n][n], int i, int j){
return a[i][j];
}
/* Retrieval of element a[i][j] */
int fix_ele(fix_matrix a, int i, int j) {
return a[i][j];
}
mov edx, dword [ebp + 12] ; i
sal edx, 6 ; i*64
mov eax, dword [ebp + 16] ; j
sal eax, 2 ; j*4
add eax, dword [ebp + 8] ; a + j*4
mov eax, dword [eax + edx] ; *(a + j*4 + i*64)
Element access Address A + i * (C * K) + j * K
C = 16, K = 4
16 X 16 matrix
n X n matrix
/* Retrieval of element a[i][j] */
int var_ele(int n, int a[n][n], int i, int j) {
return a[i][j];
}
mov eax, dword [ebp + 8] ; n
sal eax, 2 ; n*4
mov edx, eax ; n*4
imul edx, dword [ebp + 16] ; i*n*4
mov eax, dword [ebp + 20] ; j
sal eax, 2 ; j*4
add eax, dword [ebp + 12] ; a + j*4
mov eax, dword [eax + edx] ; *(a + j*4 + i*n*4)
Element access Address A + i * (C * K) + j * K
C = n, K = 4
Optimizing array element access
• Calculations
– Process all elements in column j
• Optimization
– Fetch individual elements of the column
#define N 16
typedef int fix_matrix[N][N];
/*
Fetch of array column j
*/
void fix_column
(fix_matrix a, int j, int *dest)
{
int i;
for (i = 0; i < N; i++)
dest[i] = a[i][j];
}
a jth column
• Optimization
– Calculate ajp = &a[i][j]
• Initial value is a + 4*j
• Step is 4*N
/* Fetch of array column j */
void fix_column
(fix_matrix a, int j, int *dest)
{
int i;
for (i = 0; i < N; i++)
dest[i] = a[i][j];
}
.L8: ; loop:
mov eax, dword [ecx] ; get *ajp
mov dword [ebx + 4 * edx], eax ; store in dest[i]
add edx, 1 ; i++
add ecx, 64 ; ajp += 4*N
cmp edx, 16 ; i vs. N
jne .L8 ; if !=, goto loop
Register Value
ecx ajp
ebx dest
edx i
Optimizing array element access
– Calculate ajp = &a[i][j]
• Initial value is a + 4*j
• Step is 4*n /* Fetch of array column j */
void var_column
(int n, int a[n][n],
int j, int *dest)
{
int i;
for (i = 0; i < n; i++)
dest[i] = a[i][j];
}
.L18: ; loop:
mov eax, dword [ecx] ; get *ajp
mov dword [edi + 4 * edx], eax ; store in dest[i]
add edx, 1 ; i++
add ecx, ebx ; ajp += 4*n
cmp esi, edx ; n vs. i
jg .L18 ; if (>) goto loop
Register Value
ecx ajp
edi dest
edx i
ebx 4*n
esi n
Optimizing array element access
– Change loop direction
• Exit loop on zero counter
• Negative step
• Initial pointer values change
• It is sufficient to compare only a single index against 0
/* Fetch of array column j */
void var_column
(int n, int a[n][n],
int j, int *dest)
{
int i;
for (i = n-1; i >=0; i--)
dest[i] = a[i][j];
}
.L18: ; loop:
mov eax, dword [ecx] ; get *ajp
mov dword [edi + 4 * edx], eax ; store in dest[i]
add edx, 1 ; i++
add ecx, ebx ; ajp += 4*n
cmp esi, edx ; n vs. i
jg .L18 ; if (>) goto loop
Optimizing array element access
/* Fetch of array column j */
void var_column
(int n, int a[n][n],
int j, int *dest)
{
int i;
dest--;
for (i = n; i != 0; i--)
dest[i] = a[i-1][j];
}
.L18: ; loop:
mov eax, dword [ecx] ; get *(ajp+…)
mov dword [edi + 4 * edx], eax ; store in dest[i]
sub ecx, ebx ; ajp -= 4*n
sub edx, 1 ; i--
jnz .L18 ; if (!=) goto loop
Optimizing array element access
Register Initial value
ecx a+4*n*(n-1)+4*j
edi dest – 4
edx n
ebx 4*n
esi unused now
Machine-dependent optimization
Contest #2:branches, loops, arrays
• 5 word problems• 2 reverse engineering problems • Solve any 5 problems for grade «excellent», but at least one
reverse engineering problem.• Submit via e-judge:
- http://earth.ispras.ru/cgi-bin/new-client?contest_id=151&locale_id=0- http://earth.ispras.ru/cgi-bin/new-client?contest_id=152&locale_id=0
• Sample word problem– «Local extrema»
• Sample reverse engineering problem– «R2»
Contest #2: «Local extrema»
Let us define a local minimum of an integer sequence to be an element that is strictly less than both its neighbors. Let us define a local maximum of an integer sequence to be an element that is strictly greater than both its neighbors.
The standard input contains a non-negative integer N <= 500000 followed by N 32-bit integers comprising the sequence.
Print to the standard output first the number m of local minimums in the sequence followed by their indices. Then print the number M of local maximums followed by their indices. Indexing starts at 0. The first and last sequence elements cannot be local extrema.
Time limit: 1 second
Memory limit: 64 MB
Contest #2: «R2»
Given the following assembly language program, recover its semantics and express it as a C language program. The input is a 32-bit unsigned integer.
Time limit: 1 secondMemory limit: 64 MB
%include "io.inc"
SECTION .text
GLOBAL CMAINCMAIN:
GET_UDEC 4, EAXMOV EBX, EAXDEC EBXXOR EAX, EBXADD EAX, 1RCR EAX, 1PRINT_UDEC 4, EAXNEWLINEXOR EAX, EAXRET
CDECL
• Where parameters are placed– stack
• Parameter order– «reverse»: from stack «top» to «bottom»
• Which registers may be used by the function – EAX, EDX, ECX
• Whether the caller or the callee is responsible for cleaning up the stack on return
– Caller cleans
• Return values– EAX
– EDX:EAX (64-bit values: high half in EDX, low half in EAX)
– In memory
CDECL
• Parameters placement
– Integer• Actual value
– Pointer -> Integer• Actual value
– Array -> Pointer• Reference
– Structure/union • Actual value
Function main
#include <stdio.h>
int v;void nullify(int argc, char* argv[]);
int main(int argc, char* argv[]) {nullify(argc, argv);return 0;
}
void nullify(int argc, char* argv[]) {}
CMAIN:lea ecx, [esp+4]and esp, -16push dword [ecx-4]push ebpmov ebp, esppush ecxsub esp, 20mov eax, dword [ecx+4]mov dword [esp+4], eaxmov eax, dword [ecx]mov dword [esp], eaxcall nullifymov eax, 0add esp, 20pop ecxpop ebplea esp, [ecx-4]ret
nullify:ret
Stack alignment
STDCALL
#include <stdio.h>
__attribute__((stdcall)) int sum(int x, int y);
int main() {int a = 1, b = 2, c;c = sum(a, b);printf("%d\n", c);return 0;
}
__attribute__((stdcall)) int sum(int x, int y) {
int t = x + y;return t;
}
sum:push ebpmov ebp, espsub esp, 16mov edx, DWORD [ebp+12]mov eax, DWORD [ebp+8]add eax, edxmov DWORD [ebp-4], eaxmov eax, DWORD [ebp-4]leaveret 8
STDCALL
#include <stdio.h>
__attribute__((stdcall)) int sum(int x, int y);
int main() {int a = 1, b = 2, c;c = sum(a, b);printf("%d\n", c);return 0;
}
__attribute__((stdcall)) int sum(int x, int y) {
int t = x + y;return t;
}
CMAIN:; … mov eax, DWORD [ebp-12]mov DWORD [esp+4], eaxmov eax, DWORD [ebp-16]mov DWORD [esp], eaxcall sumsub esp, 8mov DWORD [ebp-8], eax; …
FASTCALL
#include <stdio.h>
__attribute__((fastcall)) int sum(int x, int y);
int main() {int a = 1, b = 2, c;c = sum(a, b);printf("%d\n", c);return 0;
}
__attribute__((fastcall)) int sum(int x, int y) {
int t = x + y;return t;
}
CMAIN:; … mov edx, DWORD [ebp-12]mov ecx, DWORD [ebp-16]call summov DWORD [ebp-8], eax; …
sum:lea eax, [ecx + edx]ret
Omit frame pointer
/* Returns 100 * (x + y) * (x - y) / (x * x + y * y); a zero denominator is
   replaced by 1. Declared int because the function returns a value. */
int f(int x, int y) {
    int numerator =
        (x + y) * (x - y);
    int denominator =
        x * x + y * y;
    if (0 == denominator) {
        denominator = 1;
    }
    return (100 * numerator) /
           denominator;
}
f:; setupsub esp, 8mov DWORD [esp+4], esimov esi, DWORD [esp+16]mov ecx, DWORD [esp+12]mov DWORD [esp], ebx; …
Register Value
esi y
ecx X
Saved Register
address
esi [esp + 4]
ebx [esp]
Omit frame pointer
/* Returns 100 * (x + y) * (x - y) / (x * x + y * y); a zero denominator is
   replaced by 1. Declared int because the function returns a value. */
int f(int x, int y) {
    int numerator =
        (x + y) * (x - y);
    int denominator =
        x * x + y * y;
    if (0 == denominator) {
        denominator = 1;
    }
    return (100 * numerator) /
           denominator;
}
; Middle part of f: computes the denominator x^2 + y^2 in ebx and replaces
; a zero result by 1. On entry esi = y and ecx = x.
f:
; …
    mov     edx, esi
    imul    edx, esi                ; edx = y^2
    mov     eax, ecx
    imul    eax, ecx                ; eax = x^2
    mov     ebx, edx
    add     ebx, eax                ; ebx = x^2 + y^2; ZF is set when the sum is 0
    jne     .L2                     ; non-zero denominator: keep it
    mov     ebx, 1                  ; denominator = 1
.L2:
; …
Register Value
esi y
ecx X
Omit frame pointer
/* Returns 100 * (x + y) * (x - y) / (x * x + y * y); a zero denominator is
   replaced by 1. Declared int because the function returns a value. */
int f(int x, int y) {
    int numerator =
        (x + y) * (x - y);
    int denominator =
        x * x + y * y;
    if (0 == denominator) {
        denominator = 1;
    }
    return (100 * numerator) /
           denominator;
}
; Part of f that computes the numerator (x + y) * (x - y) in edx.
; On entry esi = y and ecx = x; ecx is overwritten with x - y.
f:
; …
.L2:
    lea     edx, [esi+ecx]          ; edx = x + y
    sub     ecx, esi                ; ecx = x - y
    imul    edx, ecx                ; edx = (x + y) * (x - y)
; …
Register Value
esi y
ecx x
ebx x^2 + y^2
Omit frame pointer
/* Returns 100 * (x + y) * (x - y) / (x * x + y * y); a zero denominator is
   replaced by 1. Declared int because the function returns a value. */
int f(int x, int y) {
    int numerator =
        (x + y) * (x - y);
    int denominator =
        x * x + y * y;
    if (0 == denominator) {
        denominator = 1;
    }
    return (100 * numerator) /
           denominator;
}
f:; … imul edx, edx, 100mov eax, edxsar edx, 31idiv ebx; …
Register Value
esi y
ecx x
ebx x^2 + y^2
edx (x + y) * (x - y)
Omit frame pointer
/* Returns 100 * (x + y) * (x - y) / (x * x + y * y); a zero denominator is
   replaced by 1. Declared int because the function returns a value. */
int f(int x, int y) {
    int numerator =
        (x + y) * (x - y);
    int denominator =
        x * x + y * y;
    if (0 == denominator) {
        denominator = 1;
    }
    return (100 * numerator) /
           denominator;
}
f:; … ; finishmov esi, DWORD [esp+4]mov ebx, DWORD [esp]add esp, 8ret
Variable-length parameter list
• An ellipsis (...) is placed at the end of a parameter list.
• Data type– va_list
• Macro– va_start(va_list, last fixed param)
– va_arg(va_list, cast type)
– va_end(va_list)
Variable-length parameter list
#include <stdarg.h>
/* Returns the integer average of `count` int arguments that follow `count`.
   Returns 0 when count <= 0: there is nothing to average, and dividing by a
   zero count would be undefined. */
int average(int count, ...) {
    va_list ap;
    int j;
    int sum = 0;

    if (count <= 0)
        return 0;

    va_start(ap, count);
    for (j = 0; j < count; j++)
        sum += va_arg(ap, int);   /* fetch the next int argument */
    va_end(ap);
    return sum / count;
}
Contest #3: function call
• 5 word problems
• 2 reverse engineering problems
• Solve any 5 problems for grade «excellent», but at least one of them must be a reverse engineering problem.
• Submit via e-judge
- http://earth.ispras.ru/cgi-bin/new-client?contest_id=153&locale_id=0
- http://earth.ispras.ru/cgi-bin/new-client?contest_id=154&locale_id=0
• Sample word problem– «GCD of Four»
• Sample reverse engineering problem– «R3»
Contest #3: «GCD of Four»
The standard input contains four integers, each greater than zero and less than or equal to $10^9$. Print to the standard output their greatest common divisor.
Time limit: 1 second
Memory limit: 64 MB
Contest #3: «R3»
Given the following assembly language program, recover its semantics and express it as a C language program.
The input contains a single integer in bounds 0 to 20, inclusive.
Time limit: 1 second
Memory limit: 64 MB
%include "io.inc"
SECTION .text
GLOBAL CMAIN
CMAIN:
    GET_UDEC 4, EAX
    CALL F
    PRINT_UDEC 4, EAX
    NEWLINE
    XOR EAX, EAX
    RET
F:
    CMP EAX, 0
    JNZ .REC
    MOV EAX, 1
    RET
.REC:
    DEC EAX
    CALL F
    LEA EAX, [EAX + 2 * EAX]
    RET
Acknowledgement
We are grateful to Randal E. Bryant and David R. O'Hallaron for the great textbook and other course materials we found on the site: http://www.cs.cmu.edu/~213/
In particular, we used samples for the following topics:
1. Loops: reduction to «if-goto» form.
2. Arrays: multidimensional, multilevel.
3. Loops: machine-independent code optimization.
4. Switch: jump table.
Final exam
• 10 problems
• Grading policy– Max 6 points for each problem: 60 points total
• Grade «excellent» >= 48 points (0.8)
• Grade «good» >= 36 points (0.6)
• Grade «poor» >= 24 points (0.4)
Sample problem #1
Fill in register AL value in hex and in decimal (signed and unsigned), and values of flags CF, OF, ZF and SF after execution of the following instructions.
(a) MOV AL, 137
    ADD AL, 200
Answer: AL = _____ (hex), _____ (signed dec), _____ (unsigned dec), CF = __, OF = __, ZF = __, SF = __.
(b) MOV AL, -35
    SUB AL, 216
Answer: AL = _____ (hex), _____ (signed dec), _____ (unsigned dec), CF = __, OF = __, ZF = __, SF = __.
Sample problem #2
Assuming variable A containing the value 0xCAFE BABE, write out register AX value in hex after execution of the following instructions.
MOV AX, WORD [A + 2]
ADD AX, 3 ; Answer: AX = ______
Sample problem #3
Let register EAX contain a positive integer $x \le 2^{24}$. Write out two variants, both consisting of a single assembly instruction, that multiply x by 5. The result is to remain in EAX. Two variants are considered distinct if the mnemonics of the instructions used are different.
Answer 1:
Answer 2:
Sample problem #4
Write a program in assembly equivalent to the following C code fragment.
short *px, *py;
*px++ = --*py;
Sample problem #5
Write a program in assembly equivalent to the following C code fragment.
int x, y;
x /= -y;
Sample problem #6
Write a C code fragment equivalent to the following assembly fragment. Explain in your own words what the code does.
SECTION .text
GLOBAL foo
foo:
    MOV ESI, DWORD [a]
    TEST ESI, ESI
    JE .1
    MOV ECX, DWORD [b]
    TEST ECX, ECX
    JE .1
    MOV EDX, DWORD [ESI]
    MOV EAX, EDX
    SAR EDX, 31
    IDIV ECX
    SUB DWORD [ESI], EDX
.1:
    XOR EAX, EAX
    RET
Sample problem #7
A C function f has the following body.
*p = d;
return x - c;
This body corresponds to the following assembly code. Recover the prototype declaration of function f.
MOVSX EDX, BYTE [EBP + 12]
MOV EAX, DWORD [EBP + 16]
MOV DWORD [EAX], EDX
MOVSX EAX, WORD [EBP + 8]
MOV EDX, DWORD [EBP + 20]
SUB EDX, EAX
MOV EAX, EDX
Sample problem #8
Write a function in assembly that calculates, for given n and k, the number of combinations $C_n^k$.
The function must correspond to the following C declaration and be implemented recursively.
unsigned int
combinations(unsigned int n, unsigned int k);
• $C_n^k = C_{n-1}^{k} + C_{n-1}^{k-1}$, for all integers n, k > 0,
• $C_n^0 = 1$, for all integers n,
• $C_0^k = 0$, for all integers k > 0.
Sample problem #9
Write an assembly program that prints a sum of all odd elements of the principal diagonal of matrix
int A[N][N],
where N is a compile-time constant. No matrix input code is required.
Sample problem #10
Write a C code fragment equivalent to the following assembly fragment. Explain in your own words what the code does.
%include "io.inc"
SECTION .text
GLOBAL CMAIN
CMAIN:
    GET_DEC 4, ECX
    MOV EBX, 1
    XOR EAX, EAX
.L:
    XOR EAX, EBX
    XOR EBX, EAX
    XOR EAX, EBX
    ADD EBX, EAX
    LOOP .L
    PRINT_UDEC 4, EAX
    NEWLINE
    XOR EAX, EAX
    RET
top related