As per the event updates shown in the alarm stream, discovery going down on one of the cfgm nodes causes some of the processes (listed below) to become non-functional:
contrail-analytics-api
contrail-snmp-collector
contrail-alarm-gen
contrail-api
Below are the event updates seen in the alarm stream when discovery on one of the cfgm nodes goes down:
event: init
data: null
event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand2_value": "2", "json_operand1_value": "1", "rule": {"oper": "!=", "operand1": {"keys": ["BgpRouterState", "num_up_bgp_peer"]}, "operand2": {"keys": ["BgpRouterState", "num_bgp_peer"]}}}]}], "severity": 4, "ack": false, "timestamp": 1452507110222853, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTA3MTEwMjIyODUzLCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "BgpConnectivity"}]}, "key": "ObjectBgpRouter:nodeg20"}
event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand2_value": "2", "json_operand1_value": "1", "rule": {"oper": "!=", "operand1": {"keys": ["BgpRouterState", "num_up_bgp_peer"]}, "operand2": {"keys": ["BgpRouterState", "num_bgp_peer"]}}}]}], "severity": 4, "ack": false, "timestamp": 1452507110246054, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTA3MTEwMjQ2MDU0LCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "BgpConnectivity"}]}, "key": "ObjectBgpRouter:nodea21"}
event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand1_value": "\"PROCESS_STATE_STOPPED\"", "rule": {"oper": "!=", "operand1": {"keys": ["NodeStatus", "process_info", "process_state"]}, "operand2": {"json_value": "\"PROCESS_STATE_RUNNING\""}}, "json_vars": {"NodeStatus.process_info.process_name": "contrail-discovery:0"}}]}], "severity": 3, "ack": false, "timestamp": 1452509518459470, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTA5NTE4NDU5NDcwLCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "ProcessStatus"}]}, "key": "ObjectConfigNode:nodeg13"}
event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand1_value": "\"Non-Functional\"", "rule": {"oper": "!=", "operand1": {"keys": ["NodeStatus", "process_status", "state"]}, "operand2": {"json_value": "\"Functional\""}}, "json_vars": {"NodeStatus.process_status.module_id": "contrail-analytics-api", "NodeStatus.process_status.instance_id": "0"}}]}], "severity": 3, "ack": false, "timestamp": 1452509521421732, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTA5NTIxNDIxNzMyLCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "ProcessConnectivity"}]}, "key": "ObjectCollectorInfo:nodeg20"}
event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand1_value": "\"Non-Functional\"", "rule": {"oper": "!=", "operand1": {"keys": ["NodeStatus", "process_status", "state"]}, "operand2": {"json_value": "\"Functional\""}}, "json_vars": {"NodeStatus.process_status.module_id": "contrail-alarm-gen", "NodeStatus.process_status.instance_id": "0"}}]}, {"all_of": [{"json_operand1_value": "\"Non-Functional\"", "rule": {"oper": "!=", "operand1": {"keys": ["NodeStatus", "process_status", "state"]}, "operand2": {"json_value": "\"Functional\""}}, "json_vars": {"NodeStatus.process_status.module_id": "contrail-snmp-collector", "NodeStatus.process_status.instance_id": "0"}}]}], "severity": 3, "ack": false, "timestamp": 1452509523425514, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTA5NTIzNDI1NTE0LCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "ProcessConnectivity"}]}, "key": "ObjectCollectorInfo:nodeg20"}
event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand1_value": "\"Non-Functional\"", "rule": {"oper": "!=", "operand1": {"keys": ["NodeStatus", "process_status", "state"]}, "operand2": {"json_value": "\"Functional\""}}, "json_vars": {"NodeStatus.process_status.module_id": "contrail-alarm-gen", "NodeStatus.process_status.instance_id": "0"}}]}], "severity": 3, "ack": false, "timestamp": 1452509524523549, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTA5NTI0NTIzNTQ5LCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "ProcessConnectivity"}]}, "key": "ObjectCollectorInfo:nodeg13"}
event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand1_value": "\"Non-Functional\"", "rule": {"oper": "!=", "operand1": {"keys": ["NodeStatus", "process_status", "state"]}, "operand2": {"json_value": "\"Functional\""}}, "json_vars": {"NodeStatus.process_status.module_id": "contrail-api", "NodeStatus.process_status.instance_id": "0"}}]}], "severity": 3, "ack": false, "timestamp": 1452509524544307, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTA5NTI0NTQ0MzA3LCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "ProcessConnectivity"}]}, "key": "ObjectConfigNode:nodea21"}
event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand1_value": "\"Non-Functional\"", "rule": {"oper": "!=", "operand1": {"keys": ["NodeStatus", "process_status", "state"]}, "operand2": {"json_value": "\"Functional\""}}, "json_vars": {"NodeStatus.process_status.module_id": "contrail-alarm-gen", "NodeStatus.process_status.instance_id": "0"}}]}], "severity": 3, "ack": false, "timestamp": 1452509525440294, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTA5NTI1NDQwMjk0LCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "ProcessConnectivity"}]}, "key": "ObjectCollectorInfo:nodeg20"}
event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand1_value": "\"Non-Functional\"", "rule": {"oper": "!=", "operand1": {"keys": ["NodeStatus", "process_status", "state"]}, "operand2": {"json_value": "\"Functional\""}}, "json_vars": {"NodeStatus.process_status.module_id": "contrail-analytics-api", "NodeStatus.process_status.instance_id": "0"}}]}], "severity": 3, "ack": false, "timestamp": 1452509526435208, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTA5NTI2NDM1MjA4LCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "ProcessConnectivity"}]}, "key": "ObjectCollectorInfo:nodeg13"}
event: update
data: {"type": "UVEAlarms", "value": null, "key": "ObjectCollectorInfo:nodeg20"}
event: update
data: {"type": "UVEAlarms", "value": null, "key": "ObjectConfigNode:nodea21"}
event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand1_value": "\"PROCESS_STATE_STOPPED\"", "rule": {"oper": "!=", "operand1": {"keys": ["NodeStatus", "process_info", "process_state"]}, "operand2": {"json_value": "\"PROCESS_STATE_RUNNING\""}}, "json_vars": {"NodeStatus.process_info.process_name": "contrail-discovery:0"}}]}], "severity": 3, "ack": false, "timestamp": 1452509518459470, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTA5NTE4NDU5NDcwLCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "ProcessStatus"}, {"any_of": [{"all_of": [{"json_operand1_value": "\"Non-Functional\"", "rule": {"oper": "!=", "operand1": {"keys": ["NodeStatus", "process_status", "state"]}, "operand2": {"json_value": "\"Functional\""}}, "json_vars": {"NodeStatus.process_status.module_id": "contrail-api", "NodeStatus.process_status.instance_id": "0"}}]}], "severity": 3, "ack": false, "timestamp": 1452509526521789, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTA5NTI2NTIxNzg5LCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "ProcessConnectivity"}]}, "key": "ObjectConfigNode:nodeg13"}
event: update
data: {"type": "UVEAlarms", "value": null, "key": "ObjectCollectorInfo:nodeg13"}
event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand1_value": "\"PROCESS_STATE_STOPPED\"", "rule": {"oper": "!=", "operand1": {"keys": ["NodeStatus", "process_info", "process_state"]}, "operand2": {"json_value": "\"PROCESS_STATE_RUNNING\""}}, "json_vars": {"NodeStatus.process_info.process_name": "contrail-discovery:0"}}]}], "severity": 3, "ack": false, "timestamp": 1452509518459470, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTA5NTE4NDU5NDcwLCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "ProcessStatus"}]}, "key": "ObjectConfigNode:nodeg13"}
Also pasting the contrail-status output from each node after the trigger:
nodeg13:
root@nodeg13:~# contrail-status
== Contrail Analytics ==
supervisor-analytics: active
contrail-alarm-gen active
contrail-analytics-api active
contrail-analytics-nodemgr active
contrail-collector active
contrail-query-engine active
contrail-snmp-collector active
contrail-topology active
== Contrail Config ==
supervisor-config: active
contrail-api:0 active
contrail-config-nodemgr active
contrail-device-manager backup
contrail-discovery:0 inactive
contrail-schema backup
contrail-svc-monitor backup
ifmap active
== Contrail Web UI ==
supervisor-webui: active
contrail-webui active
contrail-webui-middleware active
== Contrail Database ==
contrail-database: active
supervisor-database: active
contrail-database-nodemgr active
kafka active
== Contrail Support Services ==
supervisor-support-service: active
rabbitmq-server active
nodea21:
root@nodea21:~# contrail-status
== Contrail vRouter ==
supervisor-vrouter: active
contrail-vrouter-agent active
contrail-vrouter-nodemgr active
== Contrail Control ==
supervisor-control: active
contrail-control active
contrail-control-nodemgr active
contrail-dns active
contrail-named active
== Contrail Analytics ==
supervisor-analytics: active
contrail-alarm-gen active
contrail-analytics-api active
contrail-analytics-nodemgr active
contrail-collector active
contrail-query-engine active
contrail-snmp-collector active
contrail-topology active
== Contrail Config ==
supervisor-config: active
contrail-api:0 active
contrail-config-nodemgr active
contrail-device-manager backup
contrail-discovery:0 active
contrail-schema backup
contrail-svc-monitor backup
ifmap active
== Contrail Database ==
contrail-database: active
supervisor-database: active
contrail-database-nodemgr active
kafka active
== Contrail Support Services ==
supervisor-support-service: active
rabbitmq-server active
nodeg20:
root@nodeg20:~# contrail-status
== Contrail vRouter ==
supervisor-vrouter: active
contrail-vrouter-agent active
contrail-vrouter-nodemgr active
== Contrail Control ==
supervisor-control: active
contrail-control active
contrail-control-nodemgr active
contrail-dns active
contrail-named active
== Contrail Analytics ==
supervisor-analytics: active
contrail-alarm-gen active
contrail-analytics-api active
contrail-analytics-nodemgr active
contrail-collector active
contrail-query-engine active
contrail-snmp-collector active
contrail-topology active
== Contrail Config ==
supervisor-config: active
contrail-api:0 active
contrail-config-nodemgr active
contrail-device-manager active
contrail-discovery:0 active
contrail-schema active
contrail-svc-monitor active
ifmap active
== Contrail Database ==
contrail-database: active
supervisor-database: active
contrail-database-nodemgr active
kafka active
== Contrail Support Services ==
supervisor-support-service: active
rabbitmq-server active
Testbed:
env.roledefs = {
'all': [host1, host2, host3],
'cfgm': [host1,host2,host3],
'webui': [host1],
'openstack': [host1],
'control': [host2, host3],
'collector': [host1, host2, host3],
'database': [host1, host2, host3],
'compute': [host2, host3],
'build': [host_build]
}
env.hostnames = {
'all': ['nodeg13', 'nodeg20', 'nodea21']
}
Initially, I thought that stopping discovery was causing the problem, but it keeps happening even when discovery is functional. Discovery starts reporting contrail-alarm-gen as initializing (Discovery: AlarmGenerator connection down), i.e. contrail-alarm-gen is becoming flaky.
Pasting the content of the alarm-stream:
event: init
data: null
event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand1_value": "\"Non-Functional\"", "rule": {"oper": "!=", "operand1": {"keys": ["NodeStatus", "process_status", "state"]}, "operand2": {"json_value": "\"Functional\""}}, "json_vars": {"NodeStatus.process_status.module_id": "contrail-analytics-api", "NodeStatus.process_status.instance_id": "0"}}]}], "severity": 3, "ack": false, "timestamp": 1452585208987553, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTg1MjA4OTg3NTUzLCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "ProcessConnectivity"}]}, "key": "ObjectCollectorInfo:nodeg13"}
event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand1_value": "\"Non-Functional\"", "rule": {"oper": "!=", "operand1": {"keys": ["NodeStatus", "process_status", "state"]}, "operand2": {"json_value": "\"Functional\""}}, "json_vars": {"NodeStatus.process_status.module_id": "contrail-api", "NodeStatus.process_status.instance_id": "0"}}]}], "severity": 3, "ack": false, "timestamp": 1452585214985576, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTg1MjE0OTg1NTc2LCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "ProcessConnectivity"}]}, "key": "ObjectConfigNode:nodeg20"}
event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand2_value": "2", "json_operand1_value": "1", "rule": {"oper": "!=", "operand1": {"keys": ["BgpRouterState", "num_up_bgp_peer"]}, "operand2": {"keys": ["BgpRouterState", "num_bgp_peer"]}}}]}], "severity": 4, "ack": false, "timestamp": 1452582185230704, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTgyMTg1MjMwNzA0LCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "BgpConnectivity"}]}, "key": "ObjectBgpRouter:nodeg20"}
event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand2_value": "2", "json_operand1_value": "1", "rule": {"oper": "!=", "operand1": {"keys": ["BgpRouterState", "num_up_bgp_peer"]}, "operand2": {"keys": ["BgpRouterState", "num_bgp_peer"]}}}]}], "severity": 4, "ack": false, "timestamp": 1452582467621435, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTgyNDY3NjIxNDM1LCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "BgpConnectivity"}]}, "key": "ObjectBgpRouter:nodea21"}
event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand1_value": "\"Non-Functional\"", "rule": {"oper": "!=", "operand1": {"keys": ["NodeStatus", "process_status", "state"]}, "operand2": {"json_value": "\"Functional\""}}, "json_vars": {"NodeStatus.process_status.module_id": "contrail-api", "NodeStatus.process_status.instance_id": "0"}}]}], "severity": 3, "ack": false, "timestamp": 1452585219024927, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTg1MjE5MDI0OTI3LCAiaHR0cF9w...