If we compile `mystrlen.c`, we get a binary file. If we inspect the contents of that binary file with `objdump -d`, we find the following bytes and their corresponding machine-code meaning, shown as assembly.
mystrlen.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <strlen1>:
0: 55 push %rbp
1: 48 89 e5 mov %rsp,%rbp
4: 48 c7 c0 ff ff ff ff mov $0xffffffffffffffff,%rax
b: 80 7c 07 01 00 cmpb $0x0,0x1(%rdi,%rax,1)
10: 48 8d 40 01 lea 0x1(%rax),%rax
14: 75 f5 jne b <strlen1+0xb>
16: 5d pop %rbp
17: c3 ret
0000000000000018 <strlen2>:
18: 55 push %rbp
19: 48 89 e5 mov %rsp,%rbp
1c: 80 3f 00 cmpb $0x0,(%rdi)
1f: 74 12 je 33 <strlen2+0x1b>
21: 31 c9 xor %ecx,%ecx
23: 48 8d 41 01 lea 0x1(%rcx),%rax
27: 80 7c 0f 01 00 cmpb $0x0,0x1(%rdi,%rcx,1)
2c: 48 89 c1 mov %rax,%rcx
2f: 75 f2 jne 23 <strlen2+0xb>
31: eb 02 jmp 35 <strlen2+0x1d>
33: 31 c0 xor %eax,%eax
35: 5d pop %rbp
36: c3 ret
0000000000000037 <strlen3>:
37: 55 push %rbp
38: 48 89 e5 mov %rsp,%rbp
3b: 48 c7 c0 ff ff ff ff mov $0xffffffffffffffff,%rax
42: 80 7c 07 01 00 cmpb $0x0,0x1(%rdi,%rax,1)
47: 48 8d 40 01 lea 0x1(%rax),%rax
4b: 75 f5 jne 42 <strlen3+0xb>
4d: 5d pop %rbp
4e: c3 ret
000000000000004f <main>:
4f: 55 push %rbp
50: 48 89 e5 mov %rsp,%rbp
53: bf 00 00 00 00 mov $0x0,%edi
58: be 00 00 00 00 mov $0x0,%esi
5d: ba 12 00 00 00 mov $0x12,%edx
62: 31 c0 xor %eax,%eax
64: e8 00 00 00 00 call 69 <main+0x1a>
69: bf 00 00 00 00 mov $0x0,%edi
6e: be 00 00 00 00 mov $0x0,%esi
73: ba 12 00 00 00 mov $0x12,%edx
78: 31 c0 xor %eax,%eax
7a: e8 00 00 00 00 call 7f <main+0x30>
7f: bf 00 00 00 00 mov $0x0,%edi
84: be 00 00 00 00 mov $0x0,%esi
89: ba 12 00 00 00 mov $0x12,%edx
8e: 31 c0 xor %eax,%eax
90: e8 00 00 00 00 call 95 <main+0x46>
95: bf 00 00 00 00 mov $0x0,%edi
9a: be 00 00 00 00 mov $0x0,%esi
9f: ba 12 00 00 00 mov $0x12,%edx
a4: 31 c0 xor %eax,%eax
a6: e8 00 00 00 00 call ab <main+0x5c>
ab: 31 c0 xor %eax,%eax
ad: 5d pop %rbp
ae: c3 ret
Comments:
- Registers are prefixed with `%` and have strange names like `%rdi` and `%ecx`, left over from the 1970s when they expected people to code directly in assembly and gave the registers names that suggested a recommended purpose for each.
- Constants, called immediates in machine code, have different syntax when they are values themselves, like `$0x0`, or being used as an address, like `33`.
- `%rsp` is special: it’s the stack pointer and is modified by instructions like `call`, `ret`, `push`, and `pop`.
- Memory is accessed with complicated address expressions like `0x1(%rdi,%rax,1)`, which does some adding and some shifting to create the address.
- Conditionals are handled by an instruction like `cmpb` that compares the subtraction of two values to 0, and then a conditional jump like `jne` that only jumps if the last comparison was (in this case) **n**ot **e**qual to zero.
- Compilers use tricks like `xor %ecx,%ecx` to zero out the cx register or the complicated `lea` instruction to perform addition because these require fewer bytes than other versions of the same operation.

Compiling the same file for 32-bit ARM instead gives the following:
mystrlen.o: file format elf32-littlearm
Disassembly of section .text:
00000000 <strlen1>:
0: e3a01000 mov r1, #0
4: e7d02001 ldrb r2, [r0, r1]
8: e2811001 add r1, r1, #1
c: e3520000 cmp r2, #0
10: 1afffffb bne 4 <strlen1+0x4>
14: e2410001 sub r0, r1, #1
18: e12fff1e bx lr
0000001c <strlen2>:
1c: e5d01000 ldrb r1, [r0]
20: e3510000 cmp r1, #0
24: 03a00000 moveq r0, #0
28: 012fff1e bxeq lr
2c: e2801001 add r1, r0, #1
30: e3a00000 mov r0, #0
34: e7d12000 ldrb r2, [r1, r0]
38: e2800001 add r0, r0, #1
3c: e3520000 cmp r2, #0
40: 1afffffb bne 34 <strlen2+0x18>
44: e12fff1e bx lr
00000048 <strlen3>:
48: e3a01000 mov r1, #0
4c: e7d02001 ldrb r2, [r0, r1]
50: e2811001 add r1, r1, #1
54: e3520000 cmp r2, #0
58: 1afffffb bne 4c <strlen3+0x4>
5c: e2410001 sub r0, r1, #1
60: e12fff1e bx lr
00000064 <main>:
64: e92d4c10 push {r4, sl, fp, lr}
68: e28db008 add fp, sp, #8
6c: e59f404c ldr r4, [pc, #76] @ c0 <main+0x5c>
70: e59f0044 ldr r0, [pc, #68] @ bc <main+0x58>
74: e3a02012 mov r2, #18
78: e1a01004 mov r1, r4
7c: ebfffffe bl 0 <printf>
80: e59f003c ldr r0, [pc, #60] @ c4 <main+0x60>
84: e1a01004 mov r1, r4
88: e3a02012 mov r2, #18
8c: ebfffffe bl 0 <printf>
90: e59f0030 ldr r0, [pc, #48] @ c8 <main+0x64>
94: e1a01004 mov r1, r4
98: e3a02012 mov r2, #18
9c: ebfffffe bl 0 <printf>
a0: e59f0024 ldr r0, [pc, #36] @ cc <main+0x68>
a4: e1a01004 mov r1, r4
a8: e3a02012 mov r2, #18
ac: ebfffffe bl 0 <printf>
b0: e3a00000 mov r0, #0
b4: e8bd4c10 pop {r4, sl, fp, lr}
b8: e12fff1e bx lr
bc: 00000013 andeq r0, r0, r3, lsl r0
c0: 00000000 andeq r0, r0, r0
c4: 00000027 andeq r0, r0, r7, lsr #32
c8: 0000003c andeq r0, r0, ip, lsr r0
cc: 00000051 andeq r0, r0, r1, asr r0
Comments:
- Registers have simple names like `r0`, `r1` and so on; plus a few special-purpose ones like `fp`, `sp`, and so on, that are manipulated by some instructions like `push` and `pop`.
- Constants, called immediates in machine code, have different syntax when they are values themselves or the address of data, like `#0`; or being used as an address of code, like `34`.
- Memory is accessed with instructions like `ldr`, which can do some arithmetic in the address.
- Conditionals are handled by an instruction like `cmp` (or most math instructions like `add`, when given an `s` suffix as in `adds`) that compares its result to 0, and then a conditional modifier on any instruction, like `moveq` for a conditional move (only move if the comparison was **eq**ual to zero) or `bne` for a conditional jump (only branch if the comparison was **n**ot **e**qual to zero). Note that the `b` in `ldrb` is not a condition: it means the load reads a single **b**yte.