how does the single-global order in std::memory_order_seq_cst work?

I'm running my below C++ code on ARM Mac machine.

The idea is that I have two independent std::atomic<int> variables x & y. These variables are initially 0 and then are updated to 1.

now, there are multiple reader threads. Each thread reads the value of x and y. The idea is that if there is

"single-global-order"

then {x=0, y=1} and {x=1, y=0} cannot be seen together.

Now, to get some synchronization, i use a std::atomic<bool> ready. The ready flag is like a trigger. All the threads can start their respective logic only after ready is true

But surprisingly, I get the {0,1} and {1,0} result on my machine.

#include <atomic>
#include <iostream>
#include <thread>
#include <vector>

const char *machine()
{
    std::cout << "Compiled on/for: ";

#if defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)
    return "x86-64 (64-bit Intel/AMD)";
#elif defined(__i386__) || defined(_M_IX86)
    return "x86 (32-bit Intel/AMD)";
#elif defined(__aarch64__) || defined(_M_ARM64)
    return "ARM64 (AArch64)";
#elif defined(__arm__) || defined(_M_ARM)
    return "ARM (32-bit)";
#elif defined(__powerpc__) || defined(__powerpc64__) || defined(_M_PPC)
    return "PowerPC";
#else
    return "Unknown architecture";
#endif
}

static constexpr std::size_t ITERATIONS = 1000000;
static constexpr std::size_t NUM_THREADS = 10;
using result = std::pair<int, int>;
std::vector<result> results;
std::atomic_bool ready;
std::atomic<int> x, y;

void initialize()
{
    results.clear();
    results.resize(NUM_THREADS, {0, 0});

    ready.store(false, std::memory_order_seq_cst);
    x.store(0, std::memory_order_seq_cst);
    y.store(0, std::memory_order_seq_cst);
}

void modify_x()
{
    while (!ready.load(std::memory_order_seq_cst))
        ;
    x.store(1, std::memory_order_seq_cst);
}

void modify_y()
{
    while (!ready.load(std::memory_order_seq_cst))
        ;
    y.store(1, std::memory_order_seq_cst);
}

void check(int i)
{
    while (!ready.load(std::memory_order_seq_cst))
        ;

    int xl = x.load(std::memory_order_seq_cst);
    int yl = y.load(std::memory_order_seq_cst);

    results[i] = {xl, yl};
}

bool check_results()
{
    // stores count of {0, 0}, {0, 1}, {1, 0}, {1, 1}
    int arr[4] = {0, 0, 0, 0};

    for (const auto &res : results)
    {
        if (res.first == 0 && res.second == 0)
            arr[0]++;
        else if (res.first == 0 && res.second == 1)
            arr[1]++;
        else if (res.first == 1 && res.second == 0)
            arr[2]++;
        else if (res.first == 1 && res.second == 1)
            arr[3]++;
    }

    bool failure = arr[1] != 0 && arr[2] != 0;

    if (failure)
        std::cout << "Results: {0, 0}: " << arr[0] << ", {0, 1}: " << arr[1]
                  << ", {1, 0}: " << arr[2] << ", {1, 1}: " << arr[3] << " failure = " << failure
                  << "\n";

    return failure;
}

bool iterate(std::size_t iter)
{

    initialize();
    std::vector<std::jthread> threads_;
    threads_.reserve(NUM_THREADS + 2);

    for (std::size_t i = 0; i < NUM_THREADS; ++i)
        threads_.emplace_back(check, i);

    threads_.emplace_back(modify_y);
    threads_.emplace_back(modify_x);

    ready.store(1, std::memory_order_seq_cst);

    for (std::size_t i = 0; i < NUM_THREADS + 2; ++i)
        threads_[i].join();
    return check_results();
}

int main()
{
    std::cout << "Compiled on/for: " << machine() << "\n";
    std::cout << "Testing sequential consistency semantics\n";

    std::size_t total_failures = 0;

    for (std::size_t i = 0; i < ITERATIONS; ++i)
        total_failures += iterate(i);

    std::cout << "Total failed iterations: " << total_failures << " out of " << ITERATIONS
              << "\n";
    return 0;
}

the run command and the output are

g++ rough2.cpp -std=c++20 -o prog && ./prog
Compiled on/for: Compiled on/for: ARM64 (AArch64)
Testing sequential consistency semantics
Results: {0, 0}: 4, {0, 1}: 1, {1, 0}: 1, {1, 1}: 4 failure = 1
Results: {0, 0}: 4, {0, 1}: 1, {1, 0}: 1, {1, 1}: 4 failure = 1
Results: {0, 0}: 5, {0, 1}: 1, {1, 0}: 1, {1, 1}: 3 failure = 1
Results: {0, 0}: 4, {0, 1}: 1, {1, 0}: 1, {1, 1}: 4 failure = 1
Results: {0, 0}: 3, {0, 1}: 1, {1, 0}: 1, {1, 1}: 5 failure = 1
Results: {0, 0}: 2, {0, 1}: 1, {1, 0}: 2, {1, 1}: 5 failure = 1
Results: {0, 0}: 3, {0, 1}: 1, {1, 0}: 1, {1, 1}: 5 failure = 1
Results: {0, 0}: 5, {0, 1}: 1, {1, 0}: 1, {1, 1}: 3 failure = 1
Results: {0, 0}: 4, {0, 1}: 1, {1, 0}: 1, {1, 1}: 4 failure = 1
Results: {0, 0}: 4, {0, 1}: 1, {1, 0}: 2, {1, 1}: 3 failure = 1
Results: {0, 0}: 2, {0, 1}: 1, {1, 0}: 1, {1, 1}: 6 failure = 1
Results: {0, 0}: 5, {0, 1}: 1, {1, 0}: 1, {1, 1}: 3 failure = 1
Results: {0, 0}: 3, {0, 1}: 1, {1, 0}: 1, {1, 1}: 5 failure = 1
Results: {0, 0}: 3, {0, 1}: 2, {1, 0}: 1, {1, 1}: 4 failure = 1
Results: {0, 0}: 4, {0, 1}: 1, {1, 0}: 1, {1, 1}: 4 failure = 1
Total failed iterations: 15 out of 1000000

I expect this to happen with std::memory_order_acquire and std::memory_order_release, but not with std::memory_order_seq_cst

Solution

The idea is that if there is "single-global-order" then {x=0, y=1} and {x=1, y=0} cannot be seen together.

This statement is false.

Proof

Simplifying OP's code, let's assume we have 4 concurrent threads:

modify_x running:

x.store(1); // #1

modify_y running:

y.store(1); // #1

reader_1 and reader_2 running:

int xl = x.load(); // #1
int yl = y.load(); // #2

If these instructions are executed in the following total order:

/* reader_1   #1 */ int x1 = x.load(); // reader_1 sees x == 0
/* modify_x   #1 */ x.store(1);
/* reader_2   #1 */ int x1 = x.load(); // reader_2 sees x == 1
/* reader_2   #2 */ int yl = y.load(); // reader_2 sees y == 0
/* modify_y   #1 */ y.store(1);
/* reader_1   #2 */ int yl = y.load(); // reader_1 sees y == 1

Then reader_1 will have x1 == 0 && y1 == 1, and reader_2 will have x1 == 1 && y1 == 0.