You are not logged in.
I was wondering if GCC would optimize a loop I have (by "unrolling" it). The code needs to be as fast as possible and it's also somewhat tricky, which makes me not want to trust optimizers.
The idea is to squeeze 8 uint8_t's into a uint64_t:
uint64_t pcode = 0;
for (int8_t it=56; it > -1; it -= 8) {
pcode |= data[7 - (it/8)] << it;
}
And here's the unrolled version (more verbose, reluctant to go with this):
/* different var names here because I've actually used this with the rest of my code, but the idea is there */
uint64_t pcode = 0;
pcode |= rstr.buf[rstr.it] << 56;
pcode |= rstr.buf[++rstr.it] << 48;
pcode |= rstr.buf[++rstr.it] << 40;
pcode |= rstr.buf[++rstr.it] << 32;
pcode |= rstr.buf[++rstr.it] << 24;
pcode |= rstr.buf[++rstr.it] << 16;
pcode |= rstr.buf[++rstr.it] << 8;
pcode |= rstr.buf[++rstr.it];
div curl F = 0
Offline
foo.c
#include <stdint.h>
int main() {
uint8_t* data;
uint64_t pcode = 0;
int8_t it;
for (it=56; it > -1; it -= 8) {
pcode |= data[7 - (it/8)] << it;
}
return (int) pcode;
}
gcc -S -c foo.c -O3
.file "foo.c"
.text
.p2align 4,,15
.globl main
.type main, @function
main:
.LFB0:
.cfi_startproc
xorl %eax, %eax
movl $56, %ecx
movzbl (%rax), %esi
movzbl 1(%rax), %edx
sall %cl, %esi
movl $48, %ecx
sall %cl, %edx
movl $40, %ecx
orl %esi, %edx
movzbl 2(%rax), %esi
sall %cl, %esi
movl $32, %ecx
orl %edx, %esi
movzbl 3(%rax), %edx
sall %cl, %edx
movzbl 4(%rax), %ecx
orl %esi, %edx
sall $24, %ecx
orl %edx, %ecx
movslq %ecx,%rdx
movzbl 5(%rax), %ecx
sall $16, %ecx
movslq %ecx,%rcx
orq %rcx, %rdx
movzbl 6(%rax), %ecx
movzbl 7(%rax), %eax
sall $8, %ecx
movslq %ecx,%rcx
orq %rcx, %rdx
orq %rdx, %rax
ret
.cfi_endproc
.LFE0:
.size main, .-main
.ident "GCC: (GNU) 4.4.0 20090630 (prerelease)"
.section .note.GNU-stack,"",@progbits
does that answer your question?
Offline
I got the assembly out of both like you suggested, here's the diff:
$ diff t1.s t2.s
1c1
< .file "t1.c"
---
> .file "t2.c"
15c15
< movzbl 2(%ebp), %eax
---
> movzbl 0(%ebp), %eax
17,19c17
< orl %edx, %eax
< movzbl 0(%ebp), %edx
< sall $16, %edx
---
> sall $16, %eax
21a20,22
> sall $8, %edx
> orl %eax, %edx
> movzbl 2(%ebp), %eax
25d25
< sall $8, %edx
and t1.s for reference (t1.c contains the manually "unrolled" loop, t2.c has the shorter for loop):
.file "t1.c"
.text
.p2align 4,,15
.globl main
.type main, @function
main:
leal 4(%esp), %ecx
andl $-16, %esp
pushl -4(%ecx)
pushl %ebp
movl %esp, %ebp
pushl %ecx
subl $16, %esp
movzbl -1(%ebp), %edx
movzbl 2(%ebp), %eax
sall $24, %edx
orl %edx, %eax
movzbl 0(%ebp), %edx
sall $16, %edx
orl %edx, %eax
movzbl 1(%ebp), %edx
addl $16, %esp
popl %ecx
popl %ebp
sall $8, %edx
orl %edx, %eax
leal -4(%ecx), %esp
ret
.size main, .-main
.section .note.GNU-stack,"",@progbits
I don't know any assembly, so I'd be grateful if someone could interpret
div curl F = 0
Offline
I am not very proficient in asm, but there seem to be very little difference between the two. did you compile tc1 with -O0?
Also, one thing is sure: the loop was unrolled (there is no jump or whatsoever).
When optimizing code, you should also consider in which context it is called. If you write a function around your critical code and that function does not get inlined, it's likely that the time spent in the function is way shorter than the time spent calling it.
Offline
ok, thanks!
(for the record, I compiled both with "-O3", no other flags)
div curl F = 0
Offline
By the way, if your uint8 array is well ordered, you should be able to do something like that:
pcode = *((uint64_t*) data);
Offline
I'd love to, but the array is massive and it's on the heap
div curl F = 0
Offline
I was wondering if GCC would optimize a loop I have (by "unrolling" it). The code needs to be as fast as possible and it's also somewhat tricky, which makes me not want to trust optimizers.
The idea is to squeeze 8 uint8_t's into a uint64_t:
uint64_t pcode = 0; for (int8_t it=56; it > -1; it -= 8) { pcode |= data[7 - (it/8)] << it; }
And here's the unrolled version (more verbose, reluctant to go with this):
/* different var names here because I've actually used this with the rest of my code, but the idea is there */ uint64_t pcode = 0; pcode |= rstr.buf[rstr.it] << 56; pcode |= rstr.buf[++rstr.it] << 48; pcode |= rstr.buf[++rstr.it] << 40; pcode |= rstr.buf[++rstr.it] << 32; pcode |= rstr.buf[++rstr.it] << 24; pcode |= rstr.buf[++rstr.it] << 16; pcode |= rstr.buf[++rstr.it] << 8; pcode |= rstr.buf[++rstr.it];
I guess that you don't want to do the unrolled verison because it's to static?
Fist of just because you hide (it/8) it doesn't make it smaller. Depending on the cpu / arch it might be a bad idea. Doing a ++ on data might be better / easier to opt. If you want something that both unrolled and not static you might go for a duff device but that will add a bit of overhead but a speedup at when the loop is big.
Offline