notifier should not log traceback if there is a problem getting stats from the instance

Bug #1206731 reported by Doug Hellmann
20
This bug affects 7 people
Affects Status Importance Assigned to Milestone
Ceilometer
Fix Released
High
Guangyu Suo

Bug Description

The ceilometer notifier plugin for nova is logging tracebacks when asked to send notifications for instances where libvirt does not have the VM.

From the test log: http://logs-dev.openstack.org/21/37821/15/check/gate-tempest-devstack-vm-full/37529/logs/screen-n-cpu.txt.gz#_2013-07-30_22_31_29_407

 1531 2013-07-30 22:31:29.403 DEBUG nova.ceilometer.notifier [req-a8e61dac-c9ef-4282-bab8-7e2273c89a97 demo demo] INFO {u'vm_state': u'active', u'availability_zone': None, u'terminated_at': None, u' ephemeral_gb': 0, u'instance_type_id': 2, u'user_data': None, u'vm_mode': None, u'deleted_at': None, u'reservation_id': u'r-gxnkk1cj', u'id': 1, u'security_groups': [{u'project_id': u'77f00b20 74964b809d095e42290a33fa', u'user_id': u'd9e252983acb4a28a5a3538ca73370df', u'name': u'euca_secgroup', u'deleted': 0, u'created_at': u'2013-07-30T22:31:00.000000', u'updated_at': None, u'rules ': [], u'deleted_at': None, u'id': 2, u'description': u'euca_secgroup description'}], u'disable_terminate': False, u'root_device_name': u'/dev/vda', u'user_id': u'd9e252983acb4a28a5a3538ca7337 0df', u'uuid': u'0904ceec-1a2b-48f4-90fa-046e95844e35', u'default_swap_device': None, u'info_cache': {u'instance_uuid': u'0904ceec-1a2b-48f4-90fa-046e95844e35', u'deleted': 1, u'created_at': u '2013-07-30T22:31:01.000000', u'updated_at': u'2013-07-30T22:31:27.000000', u'network_info': u'[{"ovs_interfaceid": null, "network": {"bridge": "br100", "subnets": [{"ips": [{"meta": {}, "vers ion": 4, "type": "fixed", "floating_ips": [], "address": "10.1.0.2"}], "version": 4, "meta": {"dhcp_server": "10.1.0.1"}, "dns": [{"meta": {}, "version": 4, "type": "dns", "address": "8.8.4.4" }], "routes": [], "cidr": "10.1.0.0/24", "gateway": {"meta": {}, "version": 4, "type": "gateway", "address": "10.1.0.1"}}, {"ips": [], "version": null, "meta": {"dhcp_server": null}, "dns": [] , "routes": [], "cidr": null, "gateway": {"meta": {}, "version": null, "type": "gateway", "address": null}}], "meta": {"tenant_id": null, "should_create_bridge": true, "bridge_interface": "eth 0"}, "id": "5d2121a6-4067-43a5-8ac1-7625c4fde438", "label": "private"}, "devname": null, "qbh_params": null, "meta": {}, "address": "fa:16:3e:ac:c0:49", "type": "bridge", "id": "61107fa8-e5a4- 44c0-a74d-8290bf69a54c", "qbg_params": null}]', u'deleted_at': u'2013-07-30T22:31:29.000000', u'id': 1}, u'hostname': u'server-0904ceec-1a2b-48f4-90fa-046e95844e35', u'launched_on': u'devstack -1375221660', u'display_description': u'', u'key_data': None, u'deleted': 0, u'scheduled_at': u'2013-07-30T22:31:01.000000', u'power_state': 1, u'default_ephemeral_device': None, u'progress': 0, u'project_id': u'77f00b2074964b809d095e42290a33fa', u'launched_at': u'2013-07-30T22:31:10.000000', u'config_drive': u'', u'node': u'devstack-1375221660', u'ramdisk_id': u'934a5d35-f044-4bc4 -9b0a-8231c5693273', u'access_ip_v6': None, u'access_ip_v4': None, u'kernel_id': u'abe1496c-1ae6-4ab4-a18f-d949901d6cb5', u'key_name': None, u'updated_at': u'2013-07-30T22:31:29.000000', u'hos t': u'devstack-1375221660', u'display_name': u'Server 0904ceec-1a2b-48f4-90fa-046e95844e35', u'system_metadata': [{u'instance_uuid': u'0904ceec-1a2b-48f4-90fa-046e95844e35', u'deleted': 0, u'c reated_at': u'2013-07-30T22:31:01.000000', u'updated_at': None, u'value': u'abe1496c-1ae6-4ab4-a18f-d949901d6cb5', u'key': u'image_kernel_id', u'deleted_at': None, u'id': 1}, {u'instance_uuid' : u'0904ceec-1a2b-48f4-90fa-046e95844e35', u'deleted': 0, u'created_at': u'2013-07-30T22:31:01.000000', u'updated_at': None, u'value': u'512', u'key': u'instance_type_memory_mb', u'deleted_at' : None, u'id': 2}, {u'instance_uuid': u'0904ceec-1a2b-48f4-90fa-046e95844e35', u'deleted': 0, u'created_at': u'2013-07-30T22:31:01.000000', u'updated_at': None, u'value': u'0', u'key': u'insta nce_type_swap', u'deleted_at': None, u'id': 3}, {u'instance_uuid': u'0904ceec-1a2b-48f4-90fa-046e95844e35', u'deleted': 0, u'created_at': u'2013-07-30T22:31:01.000000', u'updated_at': None, u' value': None, u'key': u'instance_type_vcpu_weight', u'deleted_at': None, u'id': 4}, {u'instance_uuid': u'0904ceec-1a2b-48f4-90fa-046e95844e35', u'deleted': 0, u'created_at': u'2013-07-30T22:31 :01.000000', u'updated_at': None, u'value': u'1', u'key': u'instance_type_root_gb', u'deleted_at': None, u'id': 5}, {u'instance_uuid': u'0904ceec-1a2b-48f4-90fa-046e95844e35', u'deleted': 0, u 'created_at': u'2013-07-30T22:31:01.000000', u'updated_at': None, u'value': u'2', u'key': u'instance_type_id', u'deleted_at': None, u'id': 6}, {u'instance_uuid': u'0904ceec-1a2b-48f4-90fa-046e 95844e35', u'deleted': 0, u'created_at': u'2013-07-30T22:31:01.000000', u'updated_at': None, u'value': u'934a5d35-f044-4bc4-9b0a-8231c5693273', u'key': u'image_ramdisk_id', u'deleted_at': None , u'id': 7}, {u'instance_uuid': u'0904ceec-1a2b-48f4-90fa-046e95844e35', u'deleted': 0, u'created_at': u'2013-07-30T22:31:01.000000', u'updated_at': None, u'value': u'm1.tiny', u'key': u'insta nce_type_name', u'deleted_at': None, u'id': 8}, {u'instance_uuid': u'0904ceec-1a2b-48f4-90fa-046e95844e35', u'deleted': 0, u'created_at': u'2013-07-30T22:31:01.000000', u'updated_at': None, u' value': u'0', u'key': u'instance_type_ephemeral_gb', u'deleted_at': None, u'id': 9}, {u'instance_uuid': u'0904ceec-1a2b-48f4-90fa-046e95844e35', u'deleted': 0, u'created_at': u'2013-07-30T22:3 1:01.000000', u'updated_at': None, u'value': u'1', u'key': u'instance_type_rxtx_factor', u'deleted_at': None, u'id': 10}, {u'instance_uuid': u'0904ceec-1a2b-48f4-90fa-046e95844e35', u'deleted' : 0, u'created_at': u'2013-07-30T22:31:01.000000', u'updated_at': None, u'value': u'1', u'key': u'instance_type_flavorid', u'deleted_at': None, u'id': 11}, {u'instance_uuid': u'0904ceec-1a2b-4 8f4-90fa-046e95844e35', u'deleted': 0, u'created_at': u'2013-07-30T22:31:01.000000', u'updated_at': None, u'value': u'1', u'key': u'instance_type_vcpus', u'deleted_at': None, u'id': 12}, {u'in stance_uuid': u'0904ceec-1a2b-48f4-90fa-046e95844e35', u'deleted': 0, u'created_at': u'2013-07-30T22:31:01.000000', u'updated_at': None, u'value': u'2a9eb3c3-ff82-4627-9f10-fd0c74da3109', u'ke y': u'image_base_image_ref', u'deleted_at': None, u'id': 13}], u'task_state': u'deleting', u'shutdown_terminate': False, u'cell_name': None, u'root_gb': 1, u'locked': False, u'name': u'instanc e-00000001', u'created_at': u'2013-07-30T22:31:01.000000', u'launch_index': 0, u'memory_mb': 512, u'vcpus': 1, u'image_ref': u'2a9eb3c3-ff82-4627-9f10-fd0c74da3109', u'architecture': None, u'a uto_disk_config': None, u'os_type': None, u'metadata': []} __init__ /opt/stack/new/ceilometer/ceilometer/compute/nova_notifier.py:107
 1532 2013-07-30 22:31:29.403 DEBUG ceilometer.compute.virt.libvirt.inspector [req-a8e61dac-c9ef-4282-bab8-7e2273c89a97 demo demo] Connecting to libvirt: qemu:///system _get_connection /opt/stack/ne w/ceilometer/ceilometer/compute/virt/libvirt/inspector.py:63
 1533 2013-07-30 22:31:29.407 ERROR nova.openstack.common.notifier.api [req-a8e61dac-c9ef-4282-bab8-7e2273c89a97 demo demo] Problem 'Error from libvirt while looking up None: [Error Code 8] invalid argument: virDomainLookupByName' attempting to send to notification system. Payload={'state_description': '', 'availability_zone': None, 'ephemeral_gb': 0, 'instance_type_id': 2, 'deleted_at': '', 'reservation_id': u'r-gxnkk1cj', 'instance_id': u'0904ceec-1a2b-48f4-90fa-046e95844e35', 'user_id': u'd9e252983acb4a28a5a3538ca73370df', 'hostname': u'server-0904ceec-1a2b-48f4-90fa-046e9 5844e35', 'state': u'active', 'launched_at': '2013-07-30T22:31:10.000000', 'metadata': {}, 'node': u'devstack-1375221660', 'ramdisk_id': u'934a5d35-f044-4bc4-9b0a-8231c5693273', 'access_ip_v6' : None, 'disk_gb': 1, 'access_ip_v4': None, 'kernel_id': u'abe1496c-1ae6-4ab4-a18f-d949901d6cb5', 'host': u'devstack-1375221660', 'display_name': u'Server 0904ceec-1a2b-48f4-90fa-046e95844e35' , 'image_ref_url': u'http://10.3.195.40:9292/images/2a9eb3c3-ff82-4627-9f10-fd0c74da3109', 'root_gb': 1, 'tenant_id': u'77f00b2074964b809d095e42290a33fa', 'created_at': '2013-07-30T22:31:01.00 0000', 'memory_mb': 512, 'instance_type': 'm1.tiny', 'vcpus': 1, 'image_meta': {u'kernel_id': u'abe1496c-1ae6-4ab4-a18f-d949901d6cb5', u'ramdisk_id': u'934a5d35-f044-4bc4-9b0a-8231c5693273', u 'base_image_ref': u'2a9eb3c3-ff82-4627-9f10-fd0c74da3109'}, 'architecture': None, 'os_type': None}
 1534 2013-07-30 22:31:29.407 26769 TRACE nova.openstack.common.notifier.api Traceback (most recent call last):
 1535 2013-07-30 22:31:29.407 26769 TRACE nova.openstack.common.notifier.api File "/opt/stack/new/nova/nova/openstack/common/notifier/api.py", line 143, in notify
 1536 2013-07-30 22:31:29.407 26769 TRACE nova.openstack.common.notifier.api driver.notify(context, msg)
 1537 2013-07-30 22:31:29.407 26769 TRACE nova.openstack.common.notifier.api File "/opt/stack/new/ceilometer/ceilometer/compute/nova_notifier.py", line 153, in notify
 1538 2013-07-30 22:31:29.407 26769 TRACE nova.openstack.common.notifier.api counters = gatherer(instance)
 1539 2013-07-30 22:31:29.407 26769 TRACE nova.openstack.common.notifier.api File "/opt/stack/new/ceilometer/ceilometer/compute/nova_notifier.py", line 72, in __call__
 1540 2013-07-30 22:31:29.407 26769 TRACE nova.openstack.common.notifier.api results.extend(clist)
 1541 2013-07-30 22:31:29.407 26769 TRACE nova.openstack.common.notifier.api File "/opt/stack/new/ceilometer/ceilometer/compute/pollsters/disk.py", line 85, in get_counters
 1542 2013-07-30 22:31:29.407 26769 TRACE nova.openstack.common.notifier.api instance_name,
 1543 2013-07-30 22:31:29.407 26769 TRACE nova.openstack.common.notifier.api File "/opt/stack/new/ceilometer/ceilometer/compute/pollsters/disk.py", line 58, in _populate_cache
 1544 2013-07-30 22:31:29.407 26769 TRACE nova.openstack.common.notifier.api for disk, info in inspector.inspect_disks(instance_name):
 1545 2013-07-30 22:31:29.407 26769 TRACE nova.openstack.common.notifier.api File "/opt/stack/new/ceilometer/ceilometer/compute/virt/libvirt/inspector.py", line 130, in inspect_disks
 1546 2013-07-30 22:31:29.407 26769 TRACE nova.openstack.common.notifier.api domain = self._lookup_by_name(instance_name)
 1547 2013-07-30 22:31:29.407 26769 TRACE nova.openstack.common.notifier.api File "/opt/stack/new/ceilometer/ceilometer/compute/virt/libvirt/inspector.py", line 87, in _lookup_by_name
 1548 2013-07-30 22:31:29.407 26769 TRACE nova.openstack.common.notifier.api raise virt_inspector.InstanceNotFoundException(msg)
 1549 2013-07-30 22:31:29.407 26769 TRACE nova.openstack.common.notifier.api InstanceNotFoundException: Error from libvirt while looking up None: [Error Code 8] invalid argument: virDomainLookupByNa me
 1550 2013-07-30 22:31:29.407 26769 TRACE nova.openstack.common.notifier.api
 1551 2013-07-30 22:31:29.408 AUDIT nova.compute.manager [req-a8e61dac-c9ef-4282-bab8-7e2273c89a97 demo demo] [instance: 0904ceec-1a2b-48f4-90fa-046e95844e35] Terminating instance

Revision history for this message
Doug Hellmann (doug-hellmann) wrote :

It's not clear if this problem is a race condition in the way notifications are generated, bad logic in our code deciding which VM is being referenced, or something else entirely. In any case, we should log an error without a traceback. The argument we're passing to libvirt is clearly bad, so we shouldn't make the call at all when we get None as the instance's vm id.

Changed in ceilometer:
importance: Undecided → High
status: New → Confirmed
Revision history for this message
Doug Hellmann (doug-hellmann) wrote :

Oh, and warning, that log file is 35 MB.

summary: - notifier should not log traceback if VM does not exist
+ notifier should not log traceback if there is a problem getting stats
+ from the instance
Julien Danjou (jdanjou)
Changed in ceilometer:
milestone: none → havana-3
Guangyu Suo (yugsuo)
Changed in ceilometer:
assignee: nobody → Guangyu Suo (yugsuo)
Guangyu Suo (yugsuo)
Changed in ceilometer:
status: Confirmed → In Progress
Revision history for this message
Guangyu Suo (yugsuo) wrote :

I have got the reason for this bug, it 's really a complicated reason. In a summary, when deleting instance, the ceilometer notifier plugin for nova got none instance_name which used by disk pollster plugin's inspector: https://github.com/openstack/ceilometer/blob/master/ceilometer/compute/pollsters/disk.py#L58

The reason fired this bug comes from the following codes:

1. In notifier plugin, it use nova.conduct.API to get instance info, but this API get instance_name by the key "name":
instance_ref = {
    "name": "instance-00000001",
}

https://github.com/openstack/ceilometer/blob/master/ceilometer/compute/nova_notifier.py#L140

2. Then nova.notifications get payload from instance_ref, unfortunately it doesn't contail the instance_name, this is the reason why the plugin got the none instance_name.

https://github.com/openstack/ceilometer/blob/master/ceilometer/compute/nova_notifier.py#L146

3. However, the plugin get_counters() get instance_name by the key "OS-EXT-SRV-ATTR:instance_name"

def instance_name(instance):
    return getattr(instance, 'OS-EXT-SRV-ATTR:instance_name', None)

https://github.com/openstack/ceilometer/blob/master/ceilometer/compute/pollsters/util.py#L118

4. The 'OS-EXT-SRV-ATTR:instance_name' key was returned by nova_client.instance_get_all_by_host():

instance.__dict__ = {
        OS-EXT-SRV-ATTR:instance_name: "instance-00000001",
}

which can be used by Compute AgentManager correctly.

Revision history for this message
Guangyu Suo (yugsuo) wrote :

I don't know why not use nova_client to get instance info instead of conduct.API in nova notifier plugin. Because the pollster.utils are all based on nova_client to code, and it is conflict to conduct.API

Revision history for this message
Guangyu Suo (yugsuo) wrote :

Oh, the notifier plugin will be called within nova, so it can't use nova_client, should use conduct.API directly.

Revision history for this message
Doug Hellmann (doug-hellmann) wrote :

The notifier plugin is running *inside nova*. We shouldn't need to use nova's REST API to talk to nova when we're inside nova.

Which notification event_type causes the problem? If it is instance.delete.end, then we can ignore the fact that the name is none because it means the instance really does not exist at that point.

Revision history for this message
Guangyu Suo (yugsuo) wrote :

yeah, i got it. it is the 'compute.instance.delete.start' event causes the problem, when deleting the instance. The result returned by conduct.API is quite different from nova_client, but compute.pollster.utils take them the same.

Revision history for this message
Guangyu Suo (yugsuo) wrote :

I think we should do more work at https://github.com/openstack/ceilometer/blob/master/ceilometer/compute/nova_notifier.py#L96 , because the Instance now doesn't look like what novaclient gives pollsters.

Revision history for this message
Guangyu Suo (yugsuo) wrote :

Every time deleting an instance, you will get this error, and the notification will not be sent out.

Revision history for this message
Doug Hellmann (doug-hellmann) wrote :

I'm fairly certain that we were getting the instance name at one time, otherwise this would never have worked. I wonder if something changed in nova, or if we need to be making an additional call?

Revision history for this message
OpenStack Infra (hudson-openstack) wrote : Fix proposed to ceilometer (master)

Fix proposed to branch: master
Review: https://review.openstack.org/40098

Revision history for this message
Guangyu Suo (yugsuo) wrote :

You may see a full diff version instance dict returned by nova_client and conductor.API:

http://paste.openstack.org/show/43144/

Revision history for this message
OpenStack Infra (hudson-openstack) wrote : Fix merged to ceilometer (master)

Reviewed: https://review.openstack.org/40098
Committed: http://github.com/openstack/ceilometer/commit/f73f3e17af19e0e927a1854806835a0894537ecb
Submitter: Jenkins
Branch: master

commit f73f3e17af19e0e927a1854806835a0894537ecb
Author: Guangyu Suo <email address hidden>
Date: Thu Aug 8 06:39:40 2013 +0800

    Fixes failed notification when deleting instance

    In the notifier plugin for nova, the class Instance that translates instance_ref
    returned by conduct.API to nova_client like Instance is not very robust, which
    results some attributes are none or missing, and then causes pollsters raise
    AttributeError or NoneTypeError

    In this change, add the missing or none attributes to the Instance class.

    Change-Id: I3e5aa6382021633a6bf7240bf58d3c3bed074785
    Fixs: bug #1206731

Thierry Carrez (ttx)
Changed in ceilometer:
milestone: havana-3 → havana-rc1
Revision history for this message
Guangyu Suo (yugsuo) wrote : Re: [Bug 1206731] Re: notifier should not log traceback if there is a problem getting stats from the instance
Download full text (11.9 KiB)

hello, Thierry, this bug has been fixed:
https://review.openstack.org/#/c/40098/

2013/9/5 Thierry Carrez <email address hidden>

> ** Changed in: ceilometer
> Milestone: havana-3 => havana-rc1
>
> --
> You received this bug notification because you are a bug assignee.
> https://bugs.launchpad.net/bugs/1206731
>
> Title:
> notifier should not log traceback if there is a problem getting stats
> from the instance
>
> Status in Ceilometer:
> In Progress
>
> Bug description:
> The ceilometer notifier plugin for nova is logging tracebacks when
> asked to send notifications for instances where libvirt does not have
> the VM.
>
> From the test log: http://logs-dev.openstack.org/21/37821/15/check
> /gate-tempest-devstack-vm-
> full/37529/logs/screen-n-cpu.txt.gz#_2013-07-30_22_31_29_407
>
> 1531 2013-07-30 22:31:29.403 DEBUG nova.ceilometer.notifier
> [req-a8e61dac-c9ef-4282-bab8-7e2273c89a97 demo demo] INFO {u'vm_state':
> u'active', u'availability_zone': None, u'terminated_at': None, u'
> ephemeral_gb': 0, u'instance_type_id': 2, u'user_data': None, u'vm_mode':
> None, u'deleted_at': None, u'reservation_id': u'r-gxnkk1cj', u'id': 1,
> u'security_groups': [{u'project_id': u'77f00b20
> 74964b809d095e42290a33fa', u'user_id':
> u'd9e252983acb4a28a5a3538ca73370df', u'name': u'euca_secgroup', u'deleted':
> 0, u'created_at': u'2013-07-30T22:31:00.000000', u'updated_at': None,
> u'rules ': [], u'deleted_at': None, u'id': 2, u'description':
> u'euca_secgroup description'}], u'disable_terminate': False,
> u'root_device_name': u'/dev/vda', u'user_id':
> u'd9e252983acb4a28a5a3538ca7337 0df', u'uuid':
> u'0904ceec-1a2b-48f4-90fa-046e95844e35', u'default_swap_device': None,
> u'info_cache': {u'instance_uuid': u'0904ceec-1a2b-48f4-90fa-046e95844e35',
> u'deleted': 1, u'created_at': u '2013-07-30T22:31:01.000000',
> u'updated_at': u'2013-07-30T22:31:27.000000', u'network_info':
> u'[{"ovs_interfaceid": null, "network": {"bridge": "br100", "subnets":
> [{"ips": [{"meta": {}, "vers ion": 4, "type": "fixed", "floating_ips":
> [], "address": "10.1.0.2"}], "version": 4, "meta": {"dhcp_server":
> "10.1.0.1"}, "dns": [{"meta": {}, "version": 4, "type": "dns", "address":
> "8.8.4.4" }], "routes": [], "cidr": "10.1.0.0/24", "gateway":
> {"meta": {}, "version": 4, "type": "gateway", "address": "10.1.0.1"}},
> {"ips": [], "version": null, "meta": {"dhcp_server": null}, "dns": []
> , "routes": [], "cidr": null, "gateway": {"meta": {}, "version": null,
> "type": "gateway", "address": null}}], "meta": {"tenant_id": null,
> "should_create_bridge": true, "bridge_interface": "eth 0"}, "id":
> "5d2121a6-4067-43a5-8ac1-7625c4fde438", "label": "private"}, "devname":
> null, "qbh_params": null, "meta": {}, "address": "fa:16:3e:ac:c0:49",
> "type": "bridge", "id": "61107fa8-e5a4- 44c0-a74d-8290bf69a54c",
> "qbg_params": null}]', u'deleted_at': u'2013-07-30T22:31:29.000000', u'id':
> 1}, u'hostname': u'server-0904ceec-1a2b-48f4-90fa-046e95844e35',
> u'launched_on': u'devstack -1375221660', u'display_description': u'',
> u'key_data': None, u'deleted': 0, u'scheduled_at':
> u'2013-07-30T22:31:01.000000', u'power_s...

Julien Danjou (jdanjou)
Changed in ceilometer:
milestone: havana-rc1 → havana-3
status: In Progress → Fix Committed
Thierry Carrez (ttx)
Changed in ceilometer:
status: Fix Committed → Fix Released
Thierry Carrez (ttx)
Changed in ceilometer:
milestone: havana-3 → 2013.2
To post a comment you must log in.
This report contains Public information  
Everyone can see this information.

Duplicates of this bug

Other bug subscribers

Remote bug watches

Bug watches keep track of this bug in other bug trackers.