=============== #include <stdint.h> uint32_t load_uint32_be(uint8_t\* p)...

===============

#include <stdint.h>

/*
 * Assemble a 32-bit unsigned value from the four bytes at p, stored
 * most-significant byte first (big-endian). Reads p[0]..p[3] one byte at a
 * time, so it makes no alignment or host byte-order assumptions.
 *
 * Each byte is widened to uint32_t before shifting: a bare uint8_t promotes
 * to signed int, and shifting a byte >= 0x80 left by 24 would overflow int,
 * which is undefined behaviour.
 */
uint32_t load_uint32_be(uint8_t* p)
{
    return ((uint32_t)p[0] << 24) |
           ((uint32_t)p[1] << 16) |
           ((uint32_t)p[2] << 8)  |
            (uint32_t)p[3];
}

/*
 * Assemble a 32-bit unsigned value from the four bytes at p, stored
 * least-significant byte first (little-endian). Reads p[0]..p[3] one byte at
 * a time, so it makes no alignment or host byte-order assumptions.
 *
 * Each byte is widened to uint32_t before shifting: a bare uint8_t promotes
 * to signed int, and shifting a byte >= 0x80 left by 24 would overflow int,
 * which is undefined behaviour.
 */
uint32_t load_uint32_le(uint8_t* p)
{
    return ((uint32_t)p[3] << 24) |
           ((uint32_t)p[2] << 16) |
           ((uint32_t)p[1] << 8)  |
            (uint32_t)p[0];
}

===============

gcc -O3 -fomit-frame-pointer -S bo.c

===============

# load_uint32_be -- compiler output for the C function of the same name
# (32-bit x86, AT&T syntax: source operand first, destination last).
# Builds the result in %eax from four separate byte loads; %ecx and %edx
# are used as scratch. No 32-bit load of the source word is performed.
load_uint32_be: movl 4(%esp), %edx      # edx = p, loaded from the stack

        movzbl  (%edx), %eax            # eax = p[0], zero-extended

        movzbl  1(%edx), %ecx           # ecx = p[1], zero-extended

        sall    $24, %eax               # eax = p[0] << 24

        sall    $16, %ecx               # ecx = p[1] << 16

        orl     %ecx, %eax              # eax = (p[0] << 24) | (p[1] << 16)

        movzbl  3(%edx), %ecx           # ecx = p[3] (needs no shift)

        movzbl  2(%edx), %edx           # edx = p[2]; pointer is overwritten, last use

        orl     %ecx, %eax              # eax |= p[3]

        sall    $8, %edx                # edx = p[2] << 8

        orl     %edx, %eax              # eax |= p[2] << 8 -> complete value

        ret

# load_uint32_le -- compiler output for the C function of the same name
# (32-bit x86, AT&T syntax: source operand first, destination last).
# Same shape as the big-endian routine with the byte offsets mirrored:
# four separate byte loads are shifted and OR-ed into %eax, with %ecx and
# %edx as scratch. No 32-bit load of the source word is performed.
load_uint32_le: movl 4(%esp), %edx      # edx = p, loaded from the stack

        movzbl  3(%edx), %eax           # eax = p[3], zero-extended

        movzbl  2(%edx), %ecx           # ecx = p[2], zero-extended

        sall    $24, %eax               # eax = p[3] << 24

        sall    $16, %ecx               # ecx = p[2] << 16

        orl     %ecx, %eax              # eax = (p[3] << 24) | (p[2] << 16)

        movzbl  (%edx), %ecx            # ecx = p[0] (needs no shift)

        movzbl  1(%edx), %edx           # edx = p[1]; pointer is overwritten, last use

        orl     %ecx, %eax              # eax |= p[0]

        sall    $8, %edx                # edx = p[1] << 8

        orl     %edx, %eax              # eax |= p[1] << 8 -> complete value

        ret

As the output above shows, GCC does not merge the four byte-level reads into a single 32-bit read, so the byte-by-byte form does carry some performance penalty. The true impact is probably quite low, but it does exist on x86.

It is true, however, that GCC will recognize a series of bit operations and emit a single 'bswap' instruction on x86, but only when it has a full 32-bit word to start with.