possible optimization for speed-critical C code?

vkumar · 2009-07-05 22:49:06

I was wondering if GCC would optimize a loop I have (by "unrolling" it). The code needs to be as fast as possible and it's also somewhat tricky, which makes me not want to trust optimizers.

The idea is to squeeze 8 uint8_t's into a uint64_t:

uint64_t pcode = 0;
for (int8_t it=56; it > -1; it -= 8) {
  pcode |= data[7 - (it/8)] << it;
}

And here's the unrolled version (more verbose, reluctant to go with this):

/* different var names here because I've actually used this with the rest of my code, but the idea is there */
uint64_t pcode = 0;
pcode |= rstr.buf[rstr.it] << 56;
pcode |= rstr.buf[++rstr.it] << 48;
pcode |= rstr.buf[++rstr.it] << 40;
pcode |= rstr.buf[++rstr.it] << 32;
pcode |= rstr.buf[++rstr.it] << 24;
pcode |= rstr.buf[++rstr.it] << 16;
pcode |= rstr.buf[++rstr.it] << 8;
pcode |= rstr.buf[++rstr.it];

benob · 2009-07-05 23:47:03

foo.c

#include <stdint.h>
int main() {
    uint8_t* data;
    uint64_t pcode = 0;
    int8_t it;
    for (it=56; it > -1; it -= 8) {
          pcode |= data[7 - (it/8)] << it;
    }
    return (int) pcode;
}

gcc -S -c foo.c -O3

    .file   "foo.c"
    .text
    .p2align 4,,15
.globl main
    .type   main, @function
main:
.LFB0:
    .cfi_startproc
    xorl    %eax, %eax
    movl    $56, %ecx
    movzbl  (%rax), %esi
    movzbl  1(%rax), %edx
    sall    %cl, %esi
    movl    $48, %ecx
    sall    %cl, %edx
    movl    $40, %ecx
    orl %esi, %edx
    movzbl  2(%rax), %esi
    sall    %cl, %esi
    movl    $32, %ecx
    orl %edx, %esi
    movzbl  3(%rax), %edx
    sall    %cl, %edx
    movzbl  4(%rax), %ecx
    orl %esi, %edx
    sall    $24, %ecx
    orl %edx, %ecx
    movslq  %ecx,%rdx
    movzbl  5(%rax), %ecx
    sall    $16, %ecx
    movslq  %ecx,%rcx
    orq %rcx, %rdx
    movzbl  6(%rax), %ecx
    movzbl  7(%rax), %eax
    sall    $8, %ecx
    movslq  %ecx,%rcx
    orq %rcx, %rdx
    orq %rdx, %rax
    ret
    .cfi_endproc
.LFE0:
    .size   main, .-main
    .ident  "GCC: (GNU) 4.4.0 20090630 (prerelease)"
    .section    .note.GNU-stack,"",@progbits

does that answer your question?

vkumar · 2009-07-06 00:58:31

I got the assembly out of both like you suggested, here's the diff:

$ diff t1.s t2.s 
1c1
<     .file    "t1.c"
---
>     .file    "t2.c"
15c15
<     movzbl    2(%ebp), %eax
---
>     movzbl    0(%ebp), %eax
17,19c17
<     orl    %edx, %eax
<     movzbl    0(%ebp), %edx
<     sall    $16, %edx
---
>     sall    $16, %eax
21a20,22
>     sall    $8, %edx
>     orl    %eax, %edx
>     movzbl    2(%ebp), %eax
25d25
<     sall    $8, %edx

and t1.s for reference (t1.c contains the manually "unrolled" loop, t2.c has the shorter for loop):

    .file    "t1.c"
    .text
    .p2align 4,,15
.globl main
    .type    main, @function
main:
    leal    4(%esp), %ecx
    andl    $-16, %esp
    pushl    -4(%ecx)
    pushl    %ebp
    movl    %esp, %ebp
    pushl    %ecx
    subl    $16, %esp
    movzbl    -1(%ebp), %edx
    movzbl    2(%ebp), %eax
    sall    $24, %edx
    orl    %edx, %eax
    movzbl    0(%ebp), %edx
    sall    $16, %edx
    orl    %edx, %eax
    movzbl    1(%ebp), %edx
    addl    $16, %esp
    popl    %ecx
    popl    %ebp
    sall    $8, %edx
    orl    %edx, %eax
    leal    -4(%ecx), %esp
    ret
    .size    main, .-main
    .section    .note.GNU-stack,"",@progbits

I don't know any assembly, so I'd be grateful if someone could interpret

benob · 2009-07-07 00:57:27

I am not very proficient in asm, but there seem to be very little difference between the two. did you compile tc1 with -O0?
Also, one thing is sure: the loop was unrolled (there is no jump or whatsoever).

When optimizing code, you should also consider in which context it is called. If you write a function around your critical code and that function does not get inlined, it's likely that the time spent in the function is way shorter than the time spent calling it.

vkumar · 2009-07-07 02:33:56

ok, thanks!

(for the record, I compiled both with "-O3", no other flags)

benob · 2009-07-07 17:01:16

By the way, if your uint8 array is well ordered, you should be able to do something like that:

pcode = *((uint64_t*) data);

vkumar · 2009-07-09 03:07:19

I'd love to, but the array is massive and it's on the heap

dagle · 2009-07-09 13:04:37

vkumar wrote:

I was wondering if GCC would optimize a loop I have (by "unrolling" it). The code needs to be as fast as possible and it's also somewhat tricky, which makes me not want to trust optimizers.
The idea is to squeeze 8 uint8_t's into a uint64_t:
uint64_t pcode = 0;
for (int8_t it=56; it > -1; it -= 8) {
  pcode |= data[7 - (it/8)] << it;
}
And here's the unrolled version (more verbose, reluctant to go with this):
/* different var names here because I've actually used this with the rest of my code, but the idea is there */
uint64_t pcode = 0;
pcode |= rstr.buf[rstr.it] << 56;
pcode |= rstr.buf[++rstr.it] << 48;
pcode |= rstr.buf[++rstr.it] << 40;
pcode |= rstr.buf[++rstr.it] << 32;
pcode |= rstr.buf[++rstr.it] << 24;
pcode |= rstr.buf[++rstr.it] << 16;
pcode |= rstr.buf[++rstr.it] << 8;
pcode |= rstr.buf[++rstr.it];

I guess that you don't want to do the unrolled verison because it's to static?

Fist of just because you hide (it/8) it doesn't make it smaller. Depending on the cpu / arch it might be a bad idea. Doing a ++ on data might be better / easier to opt. If you want something that both unrolled and not static you might go for a duff device but that will add a bit of overhead but a speedup at when the loop is big.

Arch Linux

#1 2009-07-05 22:49:06

possible optimization for speed-critical C code?

#2 2009-07-05 23:47:03

Re: possible optimization for speed-critical C code?

#3 2009-07-06 00:58:31

Re: possible optimization for speed-critical C code?

#4 2009-07-07 00:57:27

Re: possible optimization for speed-critical C code?

#5 2009-07-07 02:33:56

Re: possible optimization for speed-critical C code?

#6 2009-07-07 17:01:16

Re: possible optimization for speed-critical C code?

#7 2009-07-09 03:07:19

Re: possible optimization for speed-critical C code?

#8 2009-07-09 13:04:37

Re: possible optimization for speed-critical C code?

Board footer