Home > OS >  Avoid volatile bit-field assignment expression reading or writing memory several times
Avoid volatile bit-field assignment expression reading or writing memory several times

Time:01-06

I want to use volatile bit-field struct to set hardware register like following code

union foo {
    uint32_t value;
    struct {
        uint32_t x : 1;
        uint32_t y : 3;
        uint32_t z : 28;
    };
};
union foo f = {0};
int main()
{
    volatile union foo *f_ptr = &f;
    //union foo tmp;
    *f_ptr =  (union foo) {
        .x = 1,
        .y = 7,
        .z = 10,
    };
    //*f_ptr = tmp;
    return 0;
}

However, the compiler will make it to STR, LDR HW register several times. It is a terrible things that it will trigger hardware to work at once when the register is writed.

main:
    @ args = 0, pretend = 0, frame = 0
    @ frame_needed = 0, uses_anonymous_args = 0
    @ link register save eliminated.
    movw    r3, #:lower16:.LANCHOR0
    movs    r0, #0
    movt    r3, #:upper16:.LANCHOR0
    ldr r2, [r3]
    orr r2, r2, #1
    str r2, [r3]
    ldr r2, [r3]
    orr r2, r2, #14
    str r2, [r3]
    ldr r2, [r3]
    and r2, r2, #15
    orr r2, r2, #160
    str r2, [r3]
    bx  lr
    .size   main, .-main
    .global f
    .bss
    .align  2

My gcc version is : arm-linux-gnueabi-gcc (Linaro GCC 4.9-2017.01) 4.9.4 and build with -O2 optimation


I have tried to use the local variable to resolve this problem

union foo {
    uint32_t value;
    struct {
        uint32_t x : 1;
        uint32_t y : 3;
        uint32_t z : 28;
    };
};
union foo f = {0};
int main()
{
    volatile union foo *f_ptr = &f;
    union foo tmp;
    tmp =  (union foo) {
        .x = 1,
        .y = 7,
        .z = 10,
    };
    *f_ptr = tmp;
    return 0;
}

Well, it will not STR to HW register several times

main:
    @ args = 0, pretend = 0, frame = 0
    @ frame_needed = 0, uses_anonymous_args = 0
    @ link register save eliminated.
    movs    r1, #10
    movs    r2, #15
    movw    r3, #:lower16:.LANCHOR0
    bfi r2, r1, #4, #28
    movt    r3, #:upper16:.LANCHOR0
    movs    r0, #0
    str r2, [r3]
    bx  lr
    .size   main, .-main
    .global f
    .bss
    .align  2

I think it is still not a good idea to use local variable, considering the limitation of binary size for embedded system.

Is there any way to handle this problem without using local variable?


CodePudding user response:

There seems to be no need for bitfields in your program: using uint16_t types should make it simpler and generate better code:

#include <stdint.h>

union foo {
    uint32_t value;
    struct {
        uint16_t x;
        uint16_t y;
    };
};
extern union foo f;

int main() {
    volatile union foo *f_ptr = &f;
    *f_ptr = (union foo) {
        .x = 10,
        .y = 20,
    };
    return 0;
}

Code generated by arm gcc 4.6.4 linux, as produced by Godbolt Compiler Explorer:

main:
        ldr     r3, .L2
        mov     r0, #0
        mov     r2, #10
        str     r0, [r3, #0]
        strh    r2, [r3, #0]    @ movhi
        mov     r2, #20
        strh    r2, [r3, #2]    @ movhi
        bx      lr
.L2:
        .word   f

The code is much simpler but still performs a redundant store for the 32-bit value: str r0, [r3, #0] when storing the union in one shot.

Investigating this, I tried different approaches and got surprising results: using struct or union assignment generates code potentially inappropriate for memory mapped hardware register, storing the fields element-wise seems required to generate the proper code:

#include <stdint.h>

union foo {
    uint32_t value;
    struct {
        uint16_t x;
        uint16_t y;
    };
};
extern union foo f;

void store_1(void) {
    volatile union foo *f_ptr = &f;
    *f_ptr = (union foo) {
        .x = 10,
        .y = 20,
    };
}
void store_2(void) {
    volatile union foo *f_ptr = &f;
    union foo bar = { .x = 10, .y = 20, };
    *f_ptr = bar;
}
void store_3(void) {
    volatile union foo *f_ptr = &f;
    f_ptr->x = 10;
    f_ptr->y = 20;
}
int main() {
    return 0;
}

Furthermore, removing the uint32_t value; generates calls to memcpy for the struct assignment version.

Code generated by arm gcc 4.6.4 linux:

store_1:
        ldr     r3, .L2
        mov     r2, #0
        str     r2, [r3, #0]
        mov     r2, #10
        strh    r2, [r3, #0]    @ movhi
        mov     r2, #20
        strh    r2, [r3, #2]    @ movhi
        bx      lr
.L2:
        .word   f
store_2:
        ldr     r3, .L5
        ldr     r2, .L5 4
        str     r2, [r3, #0]
        bx      lr
.L5:
        .word   f
        .word   1310730
store_3:
        ldr     r3, .L8
        mov     r2, #10
        strh    r2, [r3, #0]    @ movhi
        mov     r2, #20
        strh    r2, [r3, #2]    @ movhi
        bx      lr
.L8:
        .word   f
main:
        mov     r0, #0
        bx      lr

Further investigations seems to link the issue to the use of volatile union foo *f_ptr = &f; instead of tagging the union members as volatile:

#include <stdint.h>

union foo {
    uint32_t value;
    struct {
        volatile uint16_t x;
        volatile uint16_t y;
    };
};
extern union foo f;

void store_1(void) {
    union foo *f_ptr = &f;
    *f_ptr = (union foo) {
        .x = 10,
        .y = 20,
    };
    *f_ptr = (union foo) {
        .x = 10,
        .y = 20,
    };
}
void store_2(void) {
    union foo *f_ptr = &f;
    union foo bar = { .x = 10, .y = 20, };
    *f_ptr = bar;
    *f_ptr = bar;
}
void store_3(void) {
    union foo *f_ptr = &f;
    f_ptr->x = 10;
    f_ptr->y = 20;
    f_ptr->x = 10;
    f_ptr->y = 20;
}

Code generated:

store_1:
        ldr     r3, .L2
        mov     r1, #10
        mov     r2, #20
        strh    r1, [r3, #0]    @ movhi
        strh    r2, [r3, #2]    @ movhi
        strh    r1, [r3, #0]    @ movhi
        strh    r2, [r3, #2]    @ movhi
        bx      lr
.L2:
        .word   f
store_2:
        ldr     r3, .L5
        ldr     r2, .L5 4
        str     r2, [r3, #0]
        bx      lr
.L5:
        .word   f
        .word   1310730
store_3:
        ldr     r3, .L8
        mov     r1, #10
        mov     r2, #20
        strh    r1, [r3, #0]    @ movhi
        strh    r2, [r3, #2]    @ movhi
        strh    r1, [r3, #0]    @ movhi
        strh    r2, [r3, #2]    @ movhi
        bx      lr
.L8:
        .word   f

As you can see, assigning the union does not generate appropriate code in store_2, even when qualifying the value member as volatile too.

Using the C99 compound literals seems to work correctly in store_1, generating redundant stores when the fields are qualified as volatile.

Yet I would recommend assigning the fields explicitly as in store_3, making the assignment order explicit too. If instead you want to generate a single 32-bit store, assuming it is correct for your hardware, Aki Suihkonen suggested an interesting approach.

The original problem is a side effect of how the compiler generates code for assigning compound literals to structures and unions: it first initializes the destination to all bits zero, then stores the members specified in the compound literal explicitly. Redundant stores are eliminated unless the destination is volatile qualified`. I don't believe this behavior is mandated by the C Standard, so it may well be compiler specific.

CodePudding user response:

You can force the aggregate union to be written in one go with

f_ptr->value = (union foo) {
    .x = 10,
    .y = 20,
}.value;

// produced asm
mov     r1, #10
orr     r1, r1, #1310720
str     r1, [r0]
bx      lr

CodePudding user response:

I think this is a bug in GCC. Per discussion below, you might consider using:

f_ptr->value =  (union foo) {
        .x = 1,
        .y = 7,
        .z = 10,
    } .value;

By the C standard, the code a compiler generates for a program may not access a volatile object when the original C code nominally does not access the object. The code *f_ptr = (union foo) { .x = 1, .y = 7, .z = 10, }; is a single assignment to *f_ptr. So we would expect this to generate a single store to *f_ptr; generating two stores is a violation of the standard’s requirements.

We could consider an explanation for this to be that GCC is treating the aggregate (the union and/or the structure within it) as several objects, each individually volatile, rather than one aggregated volatile object.1 But, if this were so, then it ought to generate separate 16-bit strh instructions for the parts (per the original example code, which had 16-bit parts), not the 32-bit str instructions we see.

While using a local variable appears to work around the issue, I would not rely on that, because the assignment of the compound literal above is semantically equivalent, so the cause of why GCC generates broken assembly code for one sequence of code and not the other is unclear. With different circumstances (such as additional or modified code in the function or other variations that might affect optimization), GCC might generate broken code with the local variable too.

What I would do is avoid using an aggregate for the volatile object. The hardware register is, presumably, physically more like a 32-bit unsigned integer than like a structure of bit-fields (even though semantically it is defined with bit-fields). So I would define the register as volatile uint32_t and use that type when assigning values to it. Those values could be prepared with bit shifts or structures with bit-fields or whatever other method you prefer.

It should not be necessary to avoid using local variables, as the optimizer should effectively eliminate them. However, if you wish to neither change the register definition nor use local variables, an alternative is the code I opened with:

f_ptr->value =  (union foo) {
        .x = 1,
        .y = 7,
        .z = 10,
    } .value;

That prepares the value to be stored but then assigns it using the uint32_t member of the union rather than using the whole union, and testing with ARM GCC 4.6.4 on Compiler Explorer (the closest match I could find on Compiler Explorer to what you are using) suggests it generates a single store with minimal code:

main:
        ldr     r3, .L2
        mov     r2, #175
        str     r2, [r3, #0]
        mov     r0, #0
        bx      lr
.L2:
        .word   .LANCHOR0
.LANCHOR0 = .   0
f:

Footnote

1 I would consider this to a bug too, as the C standard does not make provision for applying the volatile qualifier on a union or structure declaration as applying to the members rather than to the whole aggregate. For arrays, it does say that qualifiers apply to the elements, rather than the whole array (C 2018 6.7.3 10). It has no such wording for unions or structures.

  •  Tags:  
  • Related