Rebuild via:
  rm numad
  cc -g -O0 -fstack-protector-strong -std=gnu99 -I. -D__thread="" -Wdate-time -D_FORTIFY_SOURCE=2 -c -o numad.o numad.c
  cc -Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now numad.o -lpthread -lrt -lm -o numad
  ls -laF numad
  sudo mv numad /usr/bin/numad
My current config triggering this has a pretty common CPU list on ppc64el:
CPU(s): 160 On-line CPU(s) list: 0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156 Off-line CPU(s) list: 1-3,5-7,9-11,13-15,17-19,21-23,25-27,29-31,33-35,37-39,41-43,45-47,49-51,53-55,57-59,61-63,65-67,69-71,73-75,77-79,81-83,85-87,89-91,93-95,97-99,101-103,105-107,109-111,113-115,117-119,121-123,125-127,129-131,133-135,137-139,141-143,145-147,149-151,153-155,157-159
The assumption seems to be correct, it was due to that cpu/node mismatch assuming always linear CPUs with cpu-number == index-in-array.
With the following change the breakage no longer happens in my setup:
--- numad.c.orig	2019-06-17 09:27:49.783712059 +0000
+++ numad.c	2019-06-17 10:11:00.619113441 +0000
@@ -995,7 +995,18 @@
     int node_id = 0;
     while (nodes) {
         if (ID_IS_IN_LIST(node_id, p->node_list_p)) {
-            OR_LISTS(cpu_bind_list_p, cpu_bind_list_p, node[node_id].cpu_list_p);
+            int id = -1;
+            for (int node_ix = 0; (node_ix < num_nodes); node_ix++) {
+                if (node[node_ix].node_id == node_id) {
+                    id = node_ix;
+                    break;
+                }
+            }
+            if (id == -1) {
+                numad_log(LOG_CRIT, "Node %d is requested, but unknown\n", node_id);
+                exit(EXIT_FAILURE);
+            }
+            OR_LISTS(cpu_bind_list_p, cpu_bind_list_p, node[id].cpu_list_p);
             nodes -= 1;
         }
         node_id += 1;
Rebuild via:
  rm numad
  cc -g -O0 -fstack-protector-strong -std=gnu99 -I. -D__thread="" -Wdate-time -D_FORTIFY_SOURCE=2 -c -o numad.o numad.c
  cc -Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now numad.o -lpthread -lrt -lm -o numad
  ls -laF numad
  sudo mv numad /usr/bin/numad
My current config triggering this has a pretty common CPU list on ppc64el:
CPU(s): 160
On-line CPU(s) list: 0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156
Off-line CPU(s) list: 1-3,5-7,9-11,13-15,17-19,21-23,25-27,29-31,33-35,37-39,41-43,45-47,49-51,53-55,57-59,61-63,65-67,69-71,73-75,77-79,81-83,85-87,89-91,93-95,97-99,101-103,105-107,109-111,113-115,117-119,121-123,125-127,129-131,133-135,137-139,141-143,145-147,149-151,153-155,157-159
The assumption seems to be correct, it was due to that cpu/node mismatch assuming always linear CPUs with cpu-number == index-in-array.
With the following change the breakage no longer happens in my setup:
--- numad.c.orig	2019-06-17 09:27:49.783712059 +0000
+++ numad.c	2019-06-17 10:11:00.619113441 +0000
@@ -995,7 +995,18 @@
     int node_id = 0;
     while (nodes) {
         if (ID_IS_IN_LIST(node_id, p->node_list_p)) {
-            OR_LISTS(cpu_bind_list_p, cpu_bind_list_p, node[node_id].cpu_list_p);
+            int id = -1;
+            for (int node_ix = 0; (node_ix < num_nodes); node_ix++) {
+                if (node[node_ix].node_id == node_id) {
+                    id = node_ix;
+                    break;
+                }
+            }
+            if (id == -1) {
+                numad_log(LOG_CRIT, "Node %d is requested, but unknown\n", node_id);
+                exit(EXIT_FAILURE);
+            }
+            OR_LISTS(cpu_bind_list_p, cpu_bind_list_p, node[id].cpu_list_p);
             nodes -= 1;
         }
         node_id += 1;