Build 2695: Event updates for some of the processes becoming non functional are seen in alarm-stream when discovery on one of the nodes goes down

Bug #1532757 reported by Ankit Jain
6
This bug affects 1 person
Affects Status Importance Assigned to Milestone
Juniper Openstack
Status tracked in Trunk
Trunk
Invalid
Undecided
Sundaresan Rajangam

Bug Description

As per the event updates shown in the alarm stream, discovery going down on one of the cfgm nodes is causing some of the processes (listed below) to become non-functional:
contrail-analytics-api
contrail-snmp-collector
contrail-alarm-gen
contrail-api

Below are the event updates seen in the alarm stream when discovery on one of the cfgm nodes goes down:

event: init
data: null

event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand2_value": "2", "json_operand1_value": "1", "rule": {"oper": "!=", "operand1": {"keys": ["BgpRouterState", "num_up_bgp_peer"]}, "operand2": {"keys": ["BgpRouterState", "num_bgp_peer"]}}}]}], "severity": 4, "ack": false, "timestamp": 1452507110222853, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTA3MTEwMjIyODUzLCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "BgpConnectivity"}]}, "key": "ObjectBgpRouter:nodeg20"}

event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand2_value": "2", "json_operand1_value": "1", "rule": {"oper": "!=", "operand1": {"keys": ["BgpRouterState", "num_up_bgp_peer"]}, "operand2": {"keys": ["BgpRouterState", "num_bgp_peer"]}}}]}], "severity": 4, "ack": false, "timestamp": 1452507110246054, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTA3MTEwMjQ2MDU0LCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "BgpConnectivity"}]}, "key": "ObjectBgpRouter:nodea21"}

event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand1_value": "\"PROCESS_STATE_STOPPED\"", "rule": {"oper": "!=", "operand1": {"keys": ["NodeStatus", "process_info", "process_state"]}, "operand2": {"json_value": "\"PROCESS_STATE_RUNNING\""}}, "json_vars": {"NodeStatus.process_info.process_name": "contrail-discovery:0"}}]}], "severity": 3, "ack": false, "timestamp": 1452509518459470, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTA5NTE4NDU5NDcwLCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "ProcessStatus"}]}, "key": "ObjectConfigNode:nodeg13"}

event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand1_value": "\"Non-Functional\"", "rule": {"oper": "!=", "operand1": {"keys": ["NodeStatus", "process_status", "state"]}, "operand2": {"json_value": "\"Functional\""}}, "json_vars": {"NodeStatus.process_status.module_id": "contrail-analytics-api", "NodeStatus.process_status.instance_id": "0"}}]}], "severity": 3, "ack": false, "timestamp": 1452509521421732, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTA5NTIxNDIxNzMyLCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "ProcessConnectivity"}]}, "key": "ObjectCollectorInfo:nodeg20"}

event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand1_value": "\"Non-Functional\"", "rule": {"oper": "!=", "operand1": {"keys": ["NodeStatus", "process_status", "state"]}, "operand2": {"json_value": "\"Functional\""}}, "json_vars": {"NodeStatus.process_status.module_id": "contrail-alarm-gen", "NodeStatus.process_status.instance_id": "0"}}]}, {"all_of": [{"json_operand1_value": "\"Non-Functional\"", "rule": {"oper": "!=", "operand1": {"keys": ["NodeStatus", "process_status", "state"]}, "operand2": {"json_value": "\"Functional\""}}, "json_vars": {"NodeStatus.process_status.module_id": "contrail-snmp-collector", "NodeStatus.process_status.instance_id": "0"}}]}], "severity": 3, "ack": false, "timestamp": 1452509523425514, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTA5NTIzNDI1NTE0LCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "ProcessConnectivity"}]}, "key": "ObjectCollectorInfo:nodeg20"}

event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand1_value": "\"Non-Functional\"", "rule": {"oper": "!=", "operand1": {"keys": ["NodeStatus", "process_status", "state"]}, "operand2": {"json_value": "\"Functional\""}}, "json_vars": {"NodeStatus.process_status.module_id": "contrail-alarm-gen", "NodeStatus.process_status.instance_id": "0"}}]}], "severity": 3, "ack": false, "timestamp": 1452509524523549, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTA5NTI0NTIzNTQ5LCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "ProcessConnectivity"}]}, "key": "ObjectCollectorInfo:nodeg13"}

event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand1_value": "\"Non-Functional\"", "rule": {"oper": "!=", "operand1": {"keys": ["NodeStatus", "process_status", "state"]}, "operand2": {"json_value": "\"Functional\""}}, "json_vars": {"NodeStatus.process_status.module_id": "contrail-api", "NodeStatus.process_status.instance_id": "0"}}]}], "severity": 3, "ack": false, "timestamp": 1452509524544307, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTA5NTI0NTQ0MzA3LCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "ProcessConnectivity"}]}, "key": "ObjectConfigNode:nodea21"}

event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand1_value": "\"Non-Functional\"", "rule": {"oper": "!=", "operand1": {"keys": ["NodeStatus", "process_status", "state"]}, "operand2": {"json_value": "\"Functional\""}}, "json_vars": {"NodeStatus.process_status.module_id": "contrail-alarm-gen", "NodeStatus.process_status.instance_id": "0"}}]}], "severity": 3, "ack": false, "timestamp": 1452509525440294, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTA5NTI1NDQwMjk0LCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "ProcessConnectivity"}]}, "key": "ObjectCollectorInfo:nodeg20"}

event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand1_value": "\"Non-Functional\"", "rule": {"oper": "!=", "operand1": {"keys": ["NodeStatus", "process_status", "state"]}, "operand2": {"json_value": "\"Functional\""}}, "json_vars": {"NodeStatus.process_status.module_id": "contrail-analytics-api", "NodeStatus.process_status.instance_id": "0"}}]}], "severity": 3, "ack": false, "timestamp": 1452509526435208, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTA5NTI2NDM1MjA4LCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "ProcessConnectivity"}]}, "key": "ObjectCollectorInfo:nodeg13"}

event: update
data: {"type": "UVEAlarms", "value": null, "key": "ObjectCollectorInfo:nodeg20"}

event: update
data: {"type": "UVEAlarms", "value": null, "key": "ObjectConfigNode:nodea21"}

event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand1_value": "\"PROCESS_STATE_STOPPED\"", "rule": {"oper": "!=", "operand1": {"keys": ["NodeStatus", "process_info", "process_state"]}, "operand2": {"json_value": "\"PROCESS_STATE_RUNNING\""}}, "json_vars": {"NodeStatus.process_info.process_name": "contrail-discovery:0"}}]}], "severity": 3, "ack": false, "timestamp": 1452509518459470, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTA5NTE4NDU5NDcwLCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "ProcessStatus"}, {"any_of": [{"all_of": [{"json_operand1_value": "\"Non-Functional\"", "rule": {"oper": "!=", "operand1": {"keys": ["NodeStatus", "process_status", "state"]}, "operand2": {"json_value": "\"Functional\""}}, "json_vars": {"NodeStatus.process_status.module_id": "contrail-api", "NodeStatus.process_status.instance_id": "0"}}]}], "severity": 3, "ack": false, "timestamp": 1452509526521789, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTA5NTI2NTIxNzg5LCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "ProcessConnectivity"}]}, "key": "ObjectConfigNode:nodeg13"}

event: update
data: {"type": "UVEAlarms", "value": null, "key": "ObjectCollectorInfo:nodeg13"}

event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand1_value": "\"PROCESS_STATE_STOPPED\"", "rule": {"oper": "!=", "operand1": {"keys": ["NodeStatus", "process_info", "process_state"]}, "operand2": {"json_value": "\"PROCESS_STATE_RUNNING\""}}, "json_vars": {"NodeStatus.process_info.process_name": "contrail-discovery:0"}}]}], "severity": 3, "ack": false, "timestamp": 1452509518459470, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTA5NTE4NDU5NDcwLCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "ProcessStatus"}]}, "key": "ObjectConfigNode:nodeg13"}

Also pasting the contrail-status after the trigger:

root@nodeg13:~# contrail-status
== Contrail Analytics ==
supervisor-analytics: active
contrail-alarm-gen active
contrail-analytics-api active
contrail-analytics-nodemgr active
contrail-collector active
contrail-query-engine active
contrail-snmp-collector active
contrail-topology active

== Contrail Config ==
supervisor-config: active
contrail-api:0 active
contrail-config-nodemgr active
contrail-device-manager backup
contrail-discovery:0 inactive
contrail-schema backup
contrail-svc-monitor backup
ifmap active

== Contrail Web UI ==
supervisor-webui: active
contrail-webui active
contrail-webui-middleware active

== Contrail Database ==
contrail-database: active
supervisor-database: active
contrail-database-nodemgr active
kafka active

== Contrail Support Services ==
supervisor-support-service: active
rabbitmq-server active

nodea21:

root@nodea21:~# contrail-status
== Contrail vRouter ==
supervisor-vrouter: active
contrail-vrouter-agent active
contrail-vrouter-nodemgr active

== Contrail Control ==
supervisor-control: active
contrail-control active
contrail-control-nodemgr active
contrail-dns active
contrail-named active

== Contrail Analytics ==
supervisor-analytics: active
contrail-alarm-gen active
contrail-analytics-api active
contrail-analytics-nodemgr active
contrail-collector active
contrail-query-engine active
contrail-snmp-collector active
contrail-topology active

== Contrail Config ==
supervisor-config: active
contrail-api:0 active
contrail-config-nodemgr active
contrail-device-manager backup
contrail-discovery:0 active
contrail-schema backup
contrail-svc-monitor backup
ifmap active

== Contrail Database ==
contrail-database: active
supervisor-database: active
contrail-database-nodemgr active
kafka active

== Contrail Support Services ==
supervisor-support-service: active
rabbitmq-server active

nodeg20:

root@nodeg20:~# contrail-status
== Contrail vRouter ==
supervisor-vrouter: active
contrail-vrouter-agent active
contrail-vrouter-nodemgr active

== Contrail Control ==
supervisor-control: active
contrail-control active
contrail-control-nodemgr active
contrail-dns active
contrail-named active

== Contrail Analytics ==
supervisor-analytics: active
contrail-alarm-gen active
contrail-analytics-api active
contrail-analytics-nodemgr active
contrail-collector active
contrail-query-engine active
contrail-snmp-collector active
contrail-topology active

== Contrail Config ==
supervisor-config: active
contrail-api:0 active
contrail-config-nodemgr active
contrail-device-manager active
contrail-discovery:0 active
contrail-schema active
contrail-svc-monitor active
ifmap active

== Contrail Database ==
contrail-database: active
supervisor-database: active
contrail-database-nodemgr active
kafka active

== Contrail Support Services ==
supervisor-support-service: active
rabbitmq-server active

Testbed:

env.roledefs = {
    'all': [host1, host2, host3],
    'cfgm': [host1,host2,host3],
    'webui': [host1],
    'openstack': [host1],
    'control': [host2, host3],
    'collector': [host1, host2, host3],
    'database': [host1, host2, host3],
    'compute': [host2, host3],
    'build': [host_build]
}

env.hostnames = {
    'all': ['nodeg13', 'nodeh20', 'nodea21']

Ankit Jain (ankitja)
Changed in juniperopenstack:
milestone: none → r3.0-fcs
Revision history for this message
Ankit Jain (ankitja) wrote :
Download full text (39.8 KiB)

Initially, I thought stopping discovery was causing the problem to appear, but it keeps happening even when discovery is functional. Discovery starts reporting contrail-alarm-gen initializing (Discovery:AlarmGenerator connection down) (contrail-alarm-gen becoming flaky).

Pasting the content of the alarm-stream:

event: init
data: null

event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand1_value": "\"Non-Functional\"", "rule": {"oper": "!=", "operand1": {"keys": ["NodeStatus", "process_status", "state"]}, "operand2": {"json_value": "\"Functional\""}}, "json_vars": {"NodeStatus.process_status.module_id": "contrail-analytics-api", "NodeStatus.process_status.instance_id": "0"}}]}], "severity": 3, "ack": false, "timestamp": 1452585208987553, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTg1MjA4OTg3NTUzLCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "ProcessConnectivity"}]}, "key": "ObjectCollectorInfo:nodeg13"}

event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand1_value": "\"Non-Functional\"", "rule": {"oper": "!=", "operand1": {"keys": ["NodeStatus", "process_status", "state"]}, "operand2": {"json_value": "\"Functional\""}}, "json_vars": {"NodeStatus.process_status.module_id": "contrail-api", "NodeStatus.process_status.instance_id": "0"}}]}], "severity": 3, "ack": false, "timestamp": 1452585214985576, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTg1MjE0OTg1NTc2LCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "ProcessConnectivity"}]}, "key": "ObjectConfigNode:nodeg20"}

event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand2_value": "2", "json_operand1_value": "1", "rule": {"oper": "!=", "operand1": {"keys": ["BgpRouterState", "num_up_bgp_peer"]}, "operand2": {"keys": ["BgpRouterState", "num_bgp_peer"]}}}]}], "severity": 4, "ack": false, "timestamp": 1452582185230704, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTgyMTg1MjMwNzA0LCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "BgpConnectivity"}]}, "key": "ObjectBgpRouter:nodeg20"}

event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand2_value": "2", "json_operand1_value": "1", "rule": {"oper": "!=", "operand1": {"keys": ["BgpRouterState", "num_up_bgp_peer"]}, "operand2": {"keys": ["BgpRouterState", "num_bgp_peer"]}}}]}], "severity": 4, "ack": false, "timestamp": 1452582467621435, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTgyNDY3NjIxNDM1LCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9", "type": "BgpConnectivity"}]}, "key": "ObjectBgpRouter:nodea21"}

event: update
data: {"type": "UVEAlarms", "value": {"alarms": [{"any_of": [{"all_of": [{"json_operand1_value": "\"Non-Functional\"", "rule": {"oper": "!=", "operand1": {"keys": ["NodeStatus", "process_status", "state"]}, "operand2": {"json_value": "\"Functional\""}}, "json_vars": {"NodeStatus.process_status.module_id": "contrail-api", "NodeStatus.process_status.instance_id": "0"}}]}], "severity": 3, "ack": false, "timestamp": 1452585219024927, "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTg1MjE5MDI0OTI3LCAiaHR0cF9w...

Revision history for this message
Ankit Jain (ankitja) wrote :
Download full text (3.7 KiB)

Also pasting http://nodeg13:8081/analytics/uves/config-node/nodeg13?flat:

{

    "control-node":

[

{

    "name": "nodea21",
    "value":

{

    "UVEAlarms":

{

    "alarms":

[

{

    "any_of":

[

{

    "all_of":

[

{

    "json_operand2_value": "2",
    "json_operand1_value": "1",
    "rule":

{

    "oper": "!=",
    "operand1":

{

    "keys":

    [
        "BgpRouterState",
        "num_up_bgp_peer"
    ]

},
"operand2":
{

    "keys":

                                            [
                                                "BgpRouterState",
                                                "num_bgp_peer"
                                            ]
                                        }
                                    }
                                }
                            ]
                        }
                    ],
                    "severity": 4,
                    "ack": false,
                    "timestamp": 1452582467621435,
                    "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTgyNDY3NjIxNDM1LCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9",
                    "type": "BgpConnectivity"
                }
            ]
        }
    }

},
{

    "name": "nodeg20",
    "value":

{

    "UVEAlarms":

{

    "alarms":

[

{

    "any_of":

[

{

    "all_of":

[

{

    "json_operand2_value": "2",
    "json_operand1_value": "1",
    "rule":

{

    "oper": "!=",
    "operand1":

{

    "keys":

    [
        "BgpRouterState",
        "num_up_bgp_peer"
    ]

},
"operand2":
{

    "keys":

                                                [
                                                    "BgpRouterState",
                                                    "num_bgp_peer"
                                                ]
                                            }
                                        }
                                    }
                                ]
                            }
                        ],
                        "severity": 4,
                        "ack": false,
                        "timestamp": 1452582185230704,
                        "token": "eyJ0aW1lc3RhbXAiOiAxNDUyNTgyMTg1MjMwNzA0LCAiaHR0cF9wb3J0IjogNTk5NSwgImhvc3RfaXAiOiAiMTAuMjA0LjIxNy41MyJ9",
                        "type": "BgpConnectivity"
                    }
                ]
            }
        }
    }

],
"analytics-node":
[

{

    "name": "nodeg13",
    "value":

{

    "UVEAlarms":

{

    "alarms":

[

{

    "any_of":

[

{

    "all_of":

[

{

    "json_operand1_value": "\"Non-Functional\"",
    "rule":

{

    "oper": "!=",
    "operand1":

{

    "keys":

    [
        "NodeStatus",
        "process_status",
        "state"
    ]

},
"operand2":

    {
        "json_value": "\"Functional\""
    }

},
"json_vars":

                                            {
                                                "NodeStatus.process_status.module_id": "contrail-analytics-api",
                                                "NodeStatus.process_status.instance_id": "0"
                                            }
         ...

Read more...

Revision history for this message
Raj Reddy (rajreddy) wrote :

When an HA event happens, there is a period of time during which the status will be 'down' in some sense. That immediately gets reflected in alarm-gen, and hence alarms will show up.

In 3.1, we will implement a feature called 'alarm sinking/debouncing' that will wait a few seconds before generating the alarm.

To post a comment you must log in.
This report contains Public information  
Everyone can see this information.

Other bug subscribers

Remote bug watches

Bug watches keep track of this bug in other bug trackers.