Post

Replies

Boosts

Views

Activity

Erreur 34 311
hi i dont know if i post in a good section. i have crash without message in console only error 34 311 what does this code mean? erreur -311 smFHBlkDispErr : an error occurred during _sDisposePtr (suppression du bloc FHeader). erreur -34 dskFulErr : full disk or is this something else? thank
3
0
536
Oct ’24
Why ARM NEON is not faster than ARM ACLE ?
hello, i wrote a method with Neon for accelerate my code.. I don't have the expected result only simples instructions and, or, shift... on 64bits no gain speed. nada :-( i dont understand... how programm this method for speed? original code : unsigned long long RXBitBoard::get_legal_moves(const unsigned long long p_discs, const unsigned long long o_discs) { const unsigned long long inner_o_discs = o_discs & 0x7E7E7E7E7E7E7E7EULL; /* direction W */ unsigned long long flipped = (p_discs >> 1) & inner_o_discs; flipped |= (flipped >> 1) & inner_o_discs; unsigned long long adjacent_o_discs = inner_o_discs & (inner_o_discs >> 1); flipped |= (flipped >> 2) & adjacent_o_discs; flipped |= (flipped >> 2) & adjacent_o_discs; unsigned long long legals = flipped >> 1; // /* direction _E*/ // flipped = (p_discs << 1) & inner_o_discs; // flipped |= (flipped << 1) & inner_o_discs; // // adjacent_o_discs = inner_o_discs & (inner_o_discs << 1); // // flipped |= (flipped << 2) & adjacent_o_discs; // flipped |= (flipped << 2) & adjacent_o_discs; // // legals |= flipped << 1; // trick /* direction _E */ flipped = (p_discs << 1); legals |= ((flipped + inner_o_discs) & ~flipped); /* direction S */ flipped = (p_discs >> 8) & o_discs; flipped |= (flipped >> 8) & o_discs; adjacent_o_discs = o_discs & (o_discs >> 8); flipped |= (flipped >> 16) & adjacent_o_discs; flipped |= (flipped >> 16) & adjacent_o_discs; legals |= flipped >> 8; /* direction N */ flipped = (p_discs << 8) & o_discs; flipped |= (flipped << 8) & o_discs; adjacent_o_discs = o_discs & (o_discs << 8); flipped |= (flipped << 16) & adjacent_o_discs; flipped |= (flipped << 16) & adjacent_o_discs; legals |= flipped << 8; /* direction NE */ flipped = (p_discs >> 7) & inner_o_discs; flipped |= (flipped >> 7) & inner_o_discs; adjacent_o_discs = inner_o_discs & (inner_o_discs >> 7); flipped |= (flipped >> 14) & adjacent_o_discs; flipped |= (flipped >> 14) & adjacent_o_discs; legals |= flipped >> 7; /* direction SW */ flipped = (p_discs << 7) & inner_o_discs; flipped |= (flipped << 7) & inner_o_discs; adjacent_o_discs = inner_o_discs & (inner_o_discs << 7); flipped |= (flipped << 14) & adjacent_o_discs; flipped |= (flipped << 14) & adjacent_o_discs; legals |= flipped << 7; /* direction NW */ flipped = (p_discs >> 9) & inner_o_discs; flipped |= (flipped >> 9) & inner_o_discs; adjacent_o_discs = inner_o_discs & (inner_o_discs >> 9); flipped |= (flipped >> 18) & adjacent_o_discs; flipped |= (flipped >> 18) & adjacent_o_discs; legals |= flipped >> 9; /* direction SE */ flipped = (p_discs << 9) & inner_o_discs; flipped |= (flipped << 9) & inner_o_discs; adjacent_o_discs = inner_o_discs & (inner_o_discs << 9); flipped |= (flipped << 18) & adjacent_o_discs; flipped |= (flipped << 18) & adjacent_o_discs; legals |= flipped << 9; //Removes existing discs legals &= ~(p_discs | o_discs); return legals; } my neon code const uint64x2_t pp_discs = vdupq_n_u64(p_discs); const uint64x2_t oo_discs = vdupq_n_u64(o_discs); const uint64x2_t inner_oo_discs = vdupq_n_u64(o_discs & 0x7E7E7E7E7E7E7E7EULL); //horizontals directions -1, +1 static const int64x2_t shift_1 = {-1, 1}; static const int64x2_t shift_2 = {-2, 2}; uint64x2_t flipped = vandq_u64(vshlq_u64(pp_discs, shift_1), inner_oo_discs); flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_1), inner_oo_discs)); uint64x2_t adjacent_oo_discs = vandq_u64(inner_oo_discs, vshlq_u64(inner_oo_discs, shift_1)); flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_2), adjacent_oo_discs)); flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_2), adjacent_oo_discs)); uint64x2_t legals = vshlq_u64(flipped, shift_1); //verticals directions -8 , +8 static const int64x2_t shift_8 = {-8, 8}; static const int64x2_t shift_16 = {-16, 16}; flipped = vandq_u64(vshlq_u64(pp_discs, shift_8), oo_discs); flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_8), oo_discs)); adjacent_oo_discs = vandq_u64(oo_discs, vshlq_u64(oo_discs, shift_8)); flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_16), adjacent_oo_discs)); flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_16), adjacent_oo_discs)); legals = vorrq_u64(legals, vshlq_u64(flipped, shift_8)); //diagonals directions -7 , +7 static const int64x2_t shift_7 = {-7, 7}; static const int64x2_t shift_14 = {-14, 14}; flipped = vandq_u64(vshlq_u64(pp_discs, shift_7), inner_oo_discs); flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_7), inner_oo_discs)); adjacent_oo_discs = vandq_u64(inner_oo_discs, vshlq_u64(inner_oo_discs, shift_7)); flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_14), adjacent_oo_discs)); flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_14), adjacent_oo_discs)); legals = vorrq_u64(legals, vshlq_u64(flipped, shift_7)); //diagonals directions -9 , +9 static const int64x2_t shift_9 = {-9, 9}; static const int64x2_t shift_18 = {-18, 18}; flipped = vandq_u64(vshlq_u64(pp_discs, shift_9), inner_oo_discs); flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_9), inner_oo_discs)); adjacent_oo_discs = vandq_u64(inner_oo_discs, vshlq_u64(inner_oo_discs, shift_9)); flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_18), adjacent_oo_discs)); flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_18), adjacent_oo_discs)); legals = vorrq_u64(legals, vshlq_u64(flipped, shift_9)); return ((vgetq_lane_u64(legals, 0) | vgetq_lane_u64(legals, 1)) & ~(p_discs | o_discs)); } i even wrote a interleave version I can expect a speed gain, no?
3
0
630
Oct ’24
MPS Kernel and Sparse Matrix
hello, Do you have any information on the handling of sparse matrix with MPS and PyTorch? release date? ...
Replies
0
Boosts
0
Views
510
Activity
Dec ’25
cross compilation
hi i want create a commande line tool for arm64 et intel x86-64 Xcode 16 macbook pro M3 my setting : and but my application compile only for arm64 who can explain to me ?
Replies
5
Boosts
0
Views
985
Activity
Nov ’24
When will ARM SVE be available on APPLE chips?
hi Question is in the title.. do we already know the length of the vectors? best regards
Replies
1
Boosts
0
Views
763
Activity
Oct ’24
is it possible to choose your processor (P or E) to run your program?
hello, I would like to run benchmarks on p-cores vs e-cores, I suspect a difference of more than 50% is it possible to choose your processor (P or E) to run your program? nice day
Replies
2
Boosts
0
Views
422
Activity
Oct ’24
Erreur 34 311
hi i dont know if i post in a good section. i have crash without message in console only error 34 311 what does this code mean? erreur -311 smFHBlkDispErr : an error occurred during _sDisposePtr (suppression du bloc FHeader). erreur -34 dskFulErr : full disk or is this something else? thank
Replies
3
Boosts
0
Views
536
Activity
Oct ’24
CPU-Z For mac
Do you know any software that indicates the frequency of the cores in real time? the macbook pro adapts the frequency of the cores according to the load/temperature it seems to me thank
Replies
0
Boosts
0
Views
2.0k
Activity
Oct ’24
Why ARM NEON is not faster than ARM ACLE ?
hello, i wrote a method with Neon for accelerate my code.. I don't have the expected result only simples instructions and, or, shift... on 64bits no gain speed. nada :-( i dont understand... how programm this method for speed? original code : unsigned long long RXBitBoard::get_legal_moves(const unsigned long long p_discs, const unsigned long long o_discs) { const unsigned long long inner_o_discs = o_discs & 0x7E7E7E7E7E7E7E7EULL; /* direction W */ unsigned long long flipped = (p_discs >> 1) & inner_o_discs; flipped |= (flipped >> 1) & inner_o_discs; unsigned long long adjacent_o_discs = inner_o_discs & (inner_o_discs >> 1); flipped |= (flipped >> 2) & adjacent_o_discs; flipped |= (flipped >> 2) & adjacent_o_discs; unsigned long long legals = flipped >> 1; // /* direction _E*/ // flipped = (p_discs << 1) & inner_o_discs; // flipped |= (flipped << 1) & inner_o_discs; // // adjacent_o_discs = inner_o_discs & (inner_o_discs << 1); // // flipped |= (flipped << 2) & adjacent_o_discs; // flipped |= (flipped << 2) & adjacent_o_discs; // // legals |= flipped << 1; // trick /* direction _E */ flipped = (p_discs << 1); legals |= ((flipped + inner_o_discs) & ~flipped); /* direction S */ flipped = (p_discs >> 8) & o_discs; flipped |= (flipped >> 8) & o_discs; adjacent_o_discs = o_discs & (o_discs >> 8); flipped |= (flipped >> 16) & adjacent_o_discs; flipped |= (flipped >> 16) & adjacent_o_discs; legals |= flipped >> 8; /* direction N */ flipped = (p_discs << 8) & o_discs; flipped |= (flipped << 8) & o_discs; adjacent_o_discs = o_discs & (o_discs << 8); flipped |= (flipped << 16) & adjacent_o_discs; flipped |= (flipped << 16) & adjacent_o_discs; legals |= flipped << 8; /* direction NE */ flipped = (p_discs >> 7) & inner_o_discs; flipped |= (flipped >> 7) & inner_o_discs; adjacent_o_discs = inner_o_discs & (inner_o_discs >> 7); flipped |= (flipped >> 14) & adjacent_o_discs; flipped |= (flipped >> 14) & adjacent_o_discs; legals |= flipped >> 7; /* direction SW */ flipped = (p_discs << 7) & inner_o_discs; flipped |= (flipped << 7) & inner_o_discs; adjacent_o_discs = inner_o_discs & (inner_o_discs << 7); flipped |= (flipped << 14) & adjacent_o_discs; flipped |= (flipped << 14) & adjacent_o_discs; legals |= flipped << 7; /* direction NW */ flipped = (p_discs >> 9) & inner_o_discs; flipped |= (flipped >> 9) & inner_o_discs; adjacent_o_discs = inner_o_discs & (inner_o_discs >> 9); flipped |= (flipped >> 18) & adjacent_o_discs; flipped |= (flipped >> 18) & adjacent_o_discs; legals |= flipped >> 9; /* direction SE */ flipped = (p_discs << 9) & inner_o_discs; flipped |= (flipped << 9) & inner_o_discs; adjacent_o_discs = inner_o_discs & (inner_o_discs << 9); flipped |= (flipped << 18) & adjacent_o_discs; flipped |= (flipped << 18) & adjacent_o_discs; legals |= flipped << 9; //Removes existing discs legals &= ~(p_discs | o_discs); return legals; } my neon code const uint64x2_t pp_discs = vdupq_n_u64(p_discs); const uint64x2_t oo_discs = vdupq_n_u64(o_discs); const uint64x2_t inner_oo_discs = vdupq_n_u64(o_discs & 0x7E7E7E7E7E7E7E7EULL); //horizontals directions -1, +1 static const int64x2_t shift_1 = {-1, 1}; static const int64x2_t shift_2 = {-2, 2}; uint64x2_t flipped = vandq_u64(vshlq_u64(pp_discs, shift_1), inner_oo_discs); flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_1), inner_oo_discs)); uint64x2_t adjacent_oo_discs = vandq_u64(inner_oo_discs, vshlq_u64(inner_oo_discs, shift_1)); flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_2), adjacent_oo_discs)); flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_2), adjacent_oo_discs)); uint64x2_t legals = vshlq_u64(flipped, shift_1); //verticals directions -8 , +8 static const int64x2_t shift_8 = {-8, 8}; static const int64x2_t shift_16 = {-16, 16}; flipped = vandq_u64(vshlq_u64(pp_discs, shift_8), oo_discs); flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_8), oo_discs)); adjacent_oo_discs = vandq_u64(oo_discs, vshlq_u64(oo_discs, shift_8)); flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_16), adjacent_oo_discs)); flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_16), adjacent_oo_discs)); legals = vorrq_u64(legals, vshlq_u64(flipped, shift_8)); //diagonals directions -7 , +7 static const int64x2_t shift_7 = {-7, 7}; static const int64x2_t shift_14 = {-14, 14}; flipped = vandq_u64(vshlq_u64(pp_discs, shift_7), inner_oo_discs); flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_7), inner_oo_discs)); adjacent_oo_discs = vandq_u64(inner_oo_discs, vshlq_u64(inner_oo_discs, shift_7)); flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_14), adjacent_oo_discs)); flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_14), adjacent_oo_discs)); legals = vorrq_u64(legals, vshlq_u64(flipped, shift_7)); //diagonals directions -9 , +9 static const int64x2_t shift_9 = {-9, 9}; static const int64x2_t shift_18 = {-18, 18}; flipped = vandq_u64(vshlq_u64(pp_discs, shift_9), inner_oo_discs); flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_9), inner_oo_discs)); adjacent_oo_discs = vandq_u64(inner_oo_discs, vshlq_u64(inner_oo_discs, shift_9)); flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_18), adjacent_oo_discs)); flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_18), adjacent_oo_discs)); legals = vorrq_u64(legals, vshlq_u64(flipped, shift_9)); return ((vgetq_lane_u64(legals, 0) | vgetq_lane_u64(legals, 1)) & ~(p_discs | o_discs)); } i even wrote a interleave version I can expect a speed gain, no?
Replies
3
Boosts
0
Views
630
Activity
Oct ’24
ARM NEON Shifting vector
Hi, is there an intrinsic instruction to shift a vector from another a shift vector? thank
Replies
1
Boosts
0
Views
699
Activity
Aug ’24
build.db : disk I/O error
hello, i have this error : error: accessing build database "/Users/.../Build/Intermediates.noindex/XCBuildData/build.db": disk I/O error only if i use a custom paths for derived data... how to fix it
Replies
0
Boosts
0
Views
366
Activity
Aug ’24
__ARM_NEON
I'm looking to do conditional programming with NEON. I try the __ARN_NEON tag. it doesn't work can you help me?
Replies
2
Boosts
0
Views
734
Activity
Aug ’24