I have a problem with g++14.2 and previous versions.
When I compile my program with g++ -march=native -g
, the binary creates a SIGSEGV only in Intel CPUs with AVX instruction set. Not in my ARM64 nor in a Core2Duo Intel.
When I compile my program with g++ -g
, the binary runs normally in any platform.
When I compile my program with g++ -march=native -O3
, the binary runs normally in any platform (optimized out code?).
The place where the SIGSEGV appears, it seems correct. It is just before a call to a member function from another member function.
The assembly instruction where the SIGSEGV appears is:
=> 0x00007ff6074ea2b3 <+67>: vmovdqa %ymm0,-0x40(%rbp)
but it seems that (rbp-0x40) is not aligned to 32 bytes:
rbp 0x5fec10
I think that this is compiler specific. But I tried to apply alignas(32)
to my definitions like:
alignas(32) band_matrix<const unsigned short*, 10, 2> res2{.....}
but either that way, rbp
is not aligned to 32 bytes.
I make a minification of my program, which many times works (probably when rbp is correctly aligned to 32 bytes) and many fails (no output)
#include <cstdint>
#include <iostream>
template<typename It>
struct container_view
{
typedef typename It::reference reference;
typedef std::remove_reference_t<reference> value_type;
typedef const reference const_reference;
constexpr container_view(It it, size_t s) noexcept : it(it), s(s) {}
constexpr It begin() const noexcept { return it; }
constexpr It end() const noexcept { return it + size(); }
constexpr reference operator[](size_t i) noexcept { return *(it + i); }
/// Size of container.
constexpr size_t size() const noexcept { return s; }
private:
It it;
size_t s;
};
template<typename T, typename S = const size_t>
struct shifted_vector
{
typedef std::remove_reference_t<T> vector_type; ///< The type of vector.
T vec; ///< DenseVector of sequential non-zero elements.
S offset; ///< Offset of first non-zero element.
};
struct band_matrix_jump_iterator
{
typedef short value_type;
typedef short* pointer;
typedef short& reference;
private:
size_t b; // matrix's bandwidth
size_t v; // matrix's column
size_t e; // matrix's column's element's index.
pointer p; // Pointer to current element.
public:
constexpr band_matrix_jump_iterator(pointer p, size_t b, size_t v, size_t e) noexcept
: b{b}, v{v}, e{e}, p{p} {}
constexpr band_matrix_jump_iterator &operator++() noexcept { ++e; ++p; return *this; }
constexpr band_matrix_jump_iterator &operator+=(ptrdiff_t i) noexcept { e += i; p += i; return *this; }
constexpr band_matrix_jump_iterator operator+(ptrdiff_t i) const noexcept
{ band_matrix_jump_iterator t(*this); t += i; return t; }
constexpr reference operator*() const noexcept { return *p; }
constexpr bool operator==(const band_matrix_jump_iterator &it) const noexcept { return e == it.e; }
};
struct band_matrix
{
typedef short value_type;
typedef value_type* pointer;
constexpr size_t columns() const noexcept { return 10; }
size_t b;
pointer elements;
//---------------------------------------------------------------- TYPEDEFS OF ITERATORS AND VECTORS
typedef band_matrix_jump_iterator column_iterator;
typedef shifted_vector<container_view<column_iterator>, const uint8_t> column_vector;
//--------------------------------------------------------------------------- CONSTRUCTORS / ASSIGNS
constexpr band_matrix(size_t b) : b{b}, elements(new value_type[10]) {}
constexpr ~band_matrix() noexcept { delete[] elements; }
//------------------------------------------ GET ROWS, COLUMNS, DIAGONAL AND CORRESPONDING ITERATORS
/// Return a beginning iterator to column \c i vector.
constexpr column_iterator column_begin(size_t i) noexcept
{ return column_iterator(elements + i, 0, i, 0); }
/// Return a vector view of column \c i.
constexpr column_vector column(size_t i) noexcept
{ return {typename column_vector::vector_type{column_begin(i), 1}, (uint8_t) i}; }
};
using namespace std;
int main()
{
band_matrix m(0);
for (size_t i = 0; i < 10; ++i) m.elements[i] = i;
// checking columns
for (size_t i = 0; i < m.columns(); ++i)
{
auto v = m.column(i);
cout << (int) v.offset << ": { ";
for (auto i : v.vec) cout << i << " ";
cout << "}\n";
}
}
Is this a g++ bug? my code (above) bug?
Any suggestions? (of course the first suggestion is not to use -march=native
)
Update
My CPU is Skylake 2016 and I have a laptop with Ryzen 3, and with:
-march=sandybridge -g
it works fine-march=native -g
I have SIGSEGV-march=skylake -g
I have SIGSEGV-march=haswell -g
I have SIGSEGV-mavx2 -mfma -mbmi -mbmi2 -mmovbe -mlzcnt -mpopcnt -mrdrnd -mf16c -g
which -I think- emulates -march=haswell -g
it works fine.-g
but I not place an -O3
SIGSEGV appears too as appears with -g
, which seems that -O3
just optimize out some code.Update:
Compiling with clang++ works fine. This probably makes it a g++ bug.
My g++ (problematic) and my clang++ (working ok!) versions are:
Using built-in specs.
COLLECT_GCC=g++
COLLECT_LTO_WRAPPER=C:/Program\ Files/mingw64/bin/../libexec/gcc/x86_64-w64-mingw32/14.2.0/lto-wrapper.exe
OFFLOAD_TARGET_NAMES=nvptx-none
Target: x86_64-w64-mingw32
Configured with: ../configure --prefix=/R/winlibs_staging_ucrt64/inst_gcc-14.2.0/share/gcc --build=x86_64-w64-mingw32 --host=x86_64-w64-mingw32 --enable-offload-targets=nvptx-none --with-pkgversion='MinGW-W64 x86_64-ucrt-posix-seh, built by Brecht Sanders, r2' --with-tune=generic --enable-checking=release --enable-threads=posix --disable-sjlj-exceptions --disable-libunwind-exceptions --disable-serial-configure --disable-bootstrap --enable-host-shared --enable-plugin --disable-default-ssp --disable-rpath --disable-libstdcxx-debug --disable-version-specific-runtime-libs --with-stabs --disable-symvers --enable-languages=c,c++,fortran,lto,objc,obj-c++ --disable-gold --disable-nls --disable-stage1-checking --disable-win32-registry --disable-multilib --enable-ld --enable-libquadmath --enable-libada --enable-libssp --enable-libstdcxx --enable-lto --enable-fully-dynamic-string --enable-libgomp --enable-graphite --enable-mingw-wildcard --enable-libstdcxx-time --enable-libstdcxx-pch --with-mpc=/c/Prog/winlibs_staging_ucrt/custombuilt64 --with-mpfr=/c/Prog/winlibs_staging_ucrt/custombuilt64 --with-gmp=/c/Prog/winlibs_staging_ucrt/custombuilt64 --with-isl=/c/Prog/winlibs_staging_ucrt/custombuilt64 --disable-libstdcxx-backtrace --enable-install-libiberty --enable-__cxa_atexit --without-included-gettext --with-diagnostics-color=auto --enable-clocale=generic --with-libiconv --with-system-zlib --with-build-sysroot=/R/winlibs_staging_ucrt64/gcc-14.2.0/build_mingw/mingw-w64 CFLAGS='-D__USE_MINGW_ANSI_STDIO=0 -I/c/Prog/winlibs_staging_ucrt/custombuilt64/include/libdl-win32 -march=nocona -msahf -mtune=generic -O2 -Wno-error=format' CXXFLAGS='-D__USE_MINGW_ANSI_STDIO=0 -Wno-int-conversion -march=nocona -msahf -mtune=generic -O2' LDFLAGS='-pthread -Wl,--no-insert-timestamp -Wl,--dynamicbase -Wl,--high-entropy-va -Wl,--nxcompat -Wl,--tsaware' LD=/c/Prog/winlibs_staging_ucrt/custombuilt64/share/binutils/bin/ld.exe
Thread model: posix
Supported LTO compression algorithms: zlib zstd
gcc version 14.2.0 (MinGW-W64 x86_64-ucrt-posix-seh, built by Brecht Sanders, r2)
(built by Brecht Sanders, r2) clang version 19.1.1
Target: x86_64-w64-windows-gnu
Thread model: posix
InstalledDir: C:/Program Files/mingw64/bin
I think this is GCC bug 54412, outstanding since 2012. It's specific to MinGW.
I reproduced this on godbolt using the MinGW GCC 13.1.0 compiler with -std=c++20 -march=x86-64-v4
. You can't execute the code on godbolt, but at line 182 of the asm you can see a vmovdqa [rbp-96], ymm0
with no associated code to align the frame pointer (it's only guaranteed by the ABI to have 16-byte alignment).
The bug report mentions some possible workarounds. One is -Wa,-muse-unaligned-vector-move
which is supposed to tell the assembler to convert all aligned move instructions to unaligned. Another is to change the container_view
constructor to take its it
argument by const reference. Or, of course, -mno-avx2
.