Concurrent requests requiring updates result in blocked threads

Bug #322887 reported by Marc Tardif
Affects: certify-web
Status: Fix Released
Importance: Critical
Assigned to: Marc Tardif
Milestone: (none)

Bug Description

When multiple concurrent requests are made to a page which updates the same row in the database, some deadlock causes Zope3 threads to block on the shared resource. To reproduce the problem, first create a URL which updates the database. For example, the following line could be added to the title attribute of the hardware page:

  self.context.timestamp = datetime.now()
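
To make that concrete, here is a minimal sketch of what such a change might look like in a Zope 3 browser view; the class name and the title/model attributes are illustrative assumptions, not the actual certify-web code:

# Illustrative sketch only: force a write to the hardware row on every render.
from datetime import datetime

class HardwareView(object):
    """Browser view for a hardware object (names are assumed)."""

    def __init__(self, context, request):
        self.context = context
        self.request = request

    @property
    def title(self):
        # Touching any attribute marks the row dirty, so each GET causes an
        # UPDATE of the same hardware row when the transaction commits.
        self.context.timestamp = datetime.now()
        return self.context.model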

Second, call ab (apache benchmark) on the URL of some hardware object:

  # ab \
  -c 20 \
  -n 40 \
  -C certify_session=c.rtBaYikGZd-7HYeLew7QgGcEwwdjKtBV1K8ORJsN2b.4rpeVVTig \
  http://localhost:8080/hardware/200712-203

The output of the appserver will show many update errors, which is quite normal because the databases are configured with serializable isolation:

  ProgrammingError: could not serialize access due to concurrent update

However, what is not normal is that the appserver is then blocked, i.e. Ctrl-C doesn't even kill the process. It seems that only sending a SIGKILL manages to free the process. Furthermore, the following postgres processes can be seen in the ps output:

postgres 14091 0.0 0.6 42280 5068 ? Ss 14:37 0:00 postgres: cr3 certify_trunk_main [local] UPDATE waiting
postgres 14103 0.0 0.6 42292 4972 ? Ss 14:37 0:00 postgres: cr3 certify_trunk_main [local] UPDATE waiting
postgres 14106 0.0 0.6 42296 4972 ? Ss 14:37 0:00 postgres: cr3 certify_trunk_main [local] UPDATE waiting
postgres 14118 0.0 0.6 42296 4964 ? Ss 14:37 0:00 postgres: cr3 certify_trunk_main [local] UPDATE waiting

So, the appserver seems to be blocked on these database requests.
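
As background on the serialization errors mentioned above: under SERIALIZABLE isolation they are expected and retryable, and a well-behaved client simply rolls back and retries. A rough psycopg2 sketch of that pattern (the connection string, table and retry count are placeholders, not part of the certify code):

import psycopg2
from psycopg2 import extensions, errorcodes

def update_with_retry(dsn, hardware_id, attempts=5):
    conn = psycopg2.connect(dsn)
    conn.set_isolation_level(extensions.ISOLATION_LEVEL_SERIALIZABLE)
    try:
        for attempt in range(attempts):
            try:
                cursor = conn.cursor()
                cursor.execute("UPDATE hardware SET foo_time = now() "
                               "WHERE id = %s", (hardware_id,))
                conn.commit()
                return True
            except psycopg2.DatabaseError, exc:
                conn.rollback()
                # Only retry genuine serialization failures.
                if getattr(exc, "pgcode", None) != errorcodes.SERIALIZATION_FAILURE:
                    raise
        return False
    finally:
        conn.close()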

Marc Tardif (cr3) wrote:

In an attempt to obtain a stack trace of the threads while they are in a blocked state, the following code was added, which dumps a trace for all running threads when SIGUSR2 is sent to the process. The only problem is that when the appserver is blocked, the signal is never caught. However, it works fine right after the appserver is started, so Zope3 doesn't appear to be overriding the signal handler.

import signal
import sys
import thread
import threading
import traceback
def setup_trace_handler():
    """Configure the Python interpreter to dump stack traces for all running threads on a SIGUSR2"""

    # if threading.Thread.getIdent does not exist, monkeypatch threading.Thread
    if not hasattr(threading.Thread, 'getIdent'):
        old_bootstrap = threading.Thread._Thread__bootstrap
        def _monkeypatch_Thread__bootstrap(self):
            self._Thread__ident = thread.get_ident()
            return old_bootstrap(self)
        def _monkeypatch_Thread_getIdent(self):
            if not hasattr(self, '_Thread__ident'):
                return None
            return self._Thread__ident
        threading.Thread._Thread__bootstrap = _monkeypatch_Thread__bootstrap
        threading.Thread.getIdent = _monkeypatch_Thread_getIdent

    def _dump_traces(_, __):
        try:
            print >>sys.stderr, 'Threads:'
            thread_id_map = {}
            for thread_obj in threading.enumerate():
                thread_id_map[thread_obj.getIdent()] = thread_obj.getName()
            for (thread_id, frame) in sys._current_frames().items():
                print >>sys.stderr, '%s' % (thread_id_map.get(thread_id, 'threadid-%s' % (thread_id,)))
                traceback.print_stack(frame, file=sys.stderr)
        except Exception, e:
            print >>sys.stderr, e
    signal.signal(signal.SIGUSR2, _dump_traces)
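
Continuing the snippet above, a minimal way to exercise the handler outside the appserver (the sleeping worker and the __main__ guard are illustrative only):

import os
import time

if __name__ == "__main__":
    setup_trace_handler()
    worker = threading.Thread(target=time.sleep, args=(5,))
    worker.start()
    os.kill(os.getpid(), signal.SIGUSR2)   # should dump both thread stacks
    worker.join()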

Changed in certify-web:
assignee: nobody → cr3
importance: Undecided → Critical
status: New → Confirmed
Marc Tardif (cr3) wrote:

After attaching to the Python process with gdb, the following stack trace seems to indicate that the appserver is blocked acquiring a thread lock. This call is made from Python, so at least the problem doesn't seem to be in a compiled module.

#0 0xb7f60410 in __kernel_vsyscall ()
#1 0xb7f44d05 in sem_wait@@GLIBC_2.1 () from /lib/tls/i686/cmov/libpthread.so.0
#2 0x080f2445 in PyThread_acquire_lock ()
#3 0x080c33fc in PyEval_RestoreThread ()
#4 0x08070194 in ?? ()
#5 0x080c9ab3 in PyEval_EvalFrameEx ()
#6 0x080c96e5 in PyEval_EvalFrameEx ()
#7 0x080c96e5 in PyEval_EvalFrameEx ()
#8 0x080c96e5 in PyEval_EvalFrameEx ()
#9 0x080c96e5 in PyEval_EvalFrameEx ()
#10 0x080cb1f7 in PyEval_EvalCodeEx ()
#11 0x081136b6 in ?? ()
#12 0x0805cb97 in PyObject_Call ()
#13 0x08062bfb in ?? ()
#14 0x0805cb97 in PyObject_Call ()
#15 0x080c2e9c in PyEval_CallObjectWithKeywords ()
#16 0x080c27c4 in ?? ()
#17 0x080c9ab3 in PyEval_EvalFrameEx ()
#18 0x080cb1f7 in PyEval_EvalCodeEx ()
#19 0x080c93fe in PyEval_EvalFrameEx ()
#20 0x080c96e5 in PyEval_EvalFrameEx ()
#21 0x080c96e5 in PyEval_EvalFrameEx ()
#22 0x080cb1f7 in PyEval_EvalCodeEx ()
#23 0x080c93fe in PyEval_EvalFrameEx ()
#24 0x080cb1f7 in PyEval_EvalCodeEx ()
#25 0x080c93fe in PyEval_EvalFrameEx ()
#26 0x080c96e5 in PyEval_EvalFrameEx ()
#27 0x080c96e5 in PyEval_EvalFrameEx ()
#28 0x080c96e5 in PyEval_EvalFrameEx ()
#29 0x080c96e5 in PyEval_EvalFrameEx ()
#30 0x080c96e5 in PyEval_EvalFrameEx ()
#31 0x080c96e5 in PyEval_EvalFrameEx ()
#32 0x080cb1f7 in PyEval_EvalCodeEx ()
#33 0x081136b6 in ?? ()
#34 0x0805cb97 in PyObject_Call ()
#35 0x080c7e04 in PyEval_EvalFrameEx ()
#36 0x080c96e5 in PyEval_EvalFrameEx ()
#37 0x080c96e5 in PyEval_EvalFrameEx ()
#38 0x080c96e5 in PyEval_EvalFrameEx ()
#39 0x080cb1f7 in PyEval_EvalCodeEx ()
#40 0x081136b6 in ?? ()
#41 0x0805cb97 in PyObject_Call ()
#42 0x080c7e04 in PyEval_EvalFrameEx ()
#43 0x080c96e5 in PyEval_EvalFrameEx ()
#44 0x080c96e5 in PyEval_EvalFrameEx ()
#45 0x080cb1f7 in PyEval_EvalCodeEx ()
#46 0x081136b6 in ?? ()
#47 0x0805cb97 in PyObject_Call ()
#48 0x080c7e04 in PyEval_EvalFrameEx ()
#49 0x080c96e5 in PyEval_EvalFrameEx ()
#50 0x080c96e5 in PyEval_EvalFrameEx ()
#51 0x080c96e5 in PyEval_EvalFrameEx ()
#52 0x080cb1f7 in PyEval_EvalCodeEx ()
#53 0x081136b6 in ?? ()
#54 0x0805cb97 in PyObject_Call ()
#55 0x080c7e04 in PyEval_EvalFrameEx ()
#56 0x080c96e5 in PyEval_EvalFrameEx ()
#57 0x080c96e5 in PyEval_EvalFrameEx ()
#58 0x080cb1f7 in PyEval_EvalCodeEx ()
#59 0x080c93fe in PyEval_EvalFrameEx ()
#60 0x080cb1f7 in PyEval_EvalCodeEx ()
#61 0x080c93fe in PyEval_EvalFrameEx ()
#62 0x080cb1f7 in PyEval_EvalCodeEx ()
#63 0x080c93fe in PyEval_EvalFrameEx ()
#64 0x080cb1f7 in PyEval_EvalCodeEx ()
#65 0x080cb347 in PyEval_EvalCode ()
#66 0x080ea818 in PyRun_FileExFlags ()
#67 0x080eaab9 in PyRun_SimpleFileExFlags ()
#68 0x08059335 in Py_Main ()
#69 0x080587f2 in main ()

Marc Tardif (cr3) wrote:

In an attempt to isolate the problem further, the following multi-threaded application was written which makes only Storm calls. The database queries are those observed in the PostgreSQL log when running the above request to reproduce the problem. Unfortunately, this doesn't reproduce the blocking problem, which suggests that the problem might be in the Zope3 thread pool.

import sys, os
import transaction
from threading import Thread
from zope.component import getUtility
from canonical.certify.application import load_config
from canonical.certify.interfaces import IStoreSelector, MAIN_STORE

THREAD_COUNT = 40
ITERATION_COUNT = 40

def do_function(thread_num):
    main_store = getUtility(IStoreSelector).get(MAIN_STORE)
    for i in range(ITERATION_COUNT):
        try:
            transaction.begin()
            result = main_store.execute("""SELECT hardware.access_context_id, hardware.account_id, hardware.aliases, hardware.canonical_id, hardware.category, hardware.c
            result = main_store.execute("""UPDATE hardware SET foo_time='2009-01-29 12:30:42.443242' WHERE hardware.id = 123""")
            result = main_store.execute("""SELECT certificate.completed, certificate.hardware_id, certificate.id, certificate.name, certificate.release, certificate.secu
            result = main_store.execute("""SELECT account.access_context_id, account.billing_city, account.billing_country, account.billing_postal_code, account.billing_
            result = main_store.execute("""SELECT COUNT(*) FROM certificate WHERE certificate.hardware_id = 123""")
            result = main_store.execute("""UPDATE hardware SET foo_time='2009-01-29 12:30:42.747807' WHERE hardware.id = 123""")
            result = main_store.execute("""SELECT DISTINCT hardware.access_context_id, hardware.account_id, hardware.aliases, hardware.canonical_id, hardware.category, h
            result = main_store.execute("""SELECT DISTINCT hardware.access_context_id, hardware.account_id, hardware.aliases, hardware.canonical_id, hardware.category, h
            result = main_store.execute("""SELECT DISTINCT hardware.access_context_id, hardware.account_id, hardware.aliases, hardware.canonical_id, hardware.category, h
            transaction.commit()
        except Exception, e:
            print e

def main(argv):
    load_config(os.path.abspath("."), "application.conf")

    threads = []
    for i in range(THREAD_COUNT):
        thread = Thread(target=do_function, args=[i])
        threads.append(thread)
        thread.start()

    while True:
        alives = [t for t in threads if t.isAlive()]
        if not alives:
            break
        for alive in alives:
            try:
                # Join with a timeout so KeyboardInterrupt can still be delivered.
                alive.join(1)
            except KeyboardInterrupt:
                sys.exit(1)

    return 0

if __name__ == "__main__":
    sys.exit(main(sys.argv))

Marc Tardif (cr3) wrote:

After discussing the problem with the Landscape team, which happens to share some code, Thomas indicated that they have encountered a similar problem, detailed in bug #294118. They initially observed the problem when users clicked the submit button twice in a row, which reproduces essentially the same observations detailed in this bug: the same row in the database gets updated concurrently. That bug is still open, so there is no known solution yet.

The reason the certification website encounters the problem more often than the Landscape website is that our client submits information to the website more aggressively than theirs. So, a workaround would be to make the client less aggressive. However, this would only be a workaround to a problem that would remain lurking in many other potential situations. Ideally, locks should be handled reliably and deadlocks should not be possible even under heavy load.

Marc Tardif (cr3) wrote:

After running the appserver with the trace module and the -t argument (python -m trace -t), and disabling the mail queue delivery to prevent repetitive calls from being made, the last few lines of output are shown below. I'm still not sure whether this provides further insight into the problem, but it's worth noting for now.

...
postgres.py(199): return compile_sql_token(compile, expr, state)
 --- modulename: expr, funcname: compile_sql_token
expr.py(1284): if is_safe_token(expr) and not compile.is_reserved_word(expr):
expr.py(1285): return expr
info.py(247): if state.context is TABLE and issubclass(expr, ClassAlias):
info.py(249): return table
 --- modulename: expr, funcname: <genexpr>
expr.py(538): tables = set(compile(table, state, token=True) for table in tables)
 --- modulename: info, funcname: compile_type
info.py(245): cls_info = get_cls_info(expr)
 --- modulename: info, funcname: get_cls_info
info.py(47): if "__storm_class_info__" in cls.__dict__:
info.py(49): return cls.__dict__["__storm_class_info__"]
info.py(246): table = compile(cls_info.table, state)
 --- modulename: postgres, funcname: compile_sql_token_postgres
postgres.py(196): if "." in expr and state.context in (TABLE, COLUMN_PREFIX):
postgres.py(199): return compile_sql_token(compile, expr, state)
 --- modulename: expr, funcname: compile_sql_token
expr.py(1284): if is_safe_token(expr) and not compile.is_reserved_word(expr):
expr.py(1285): return expr
info.py(247): if state.context is TABLE and issubclass(expr, ClassAlias):
info.py(249): return table
 --- modulename: expr, funcname: <genexpr>
expr.py(538): tables = set(compile(table, state, token=True) for table in tables)
expr.py(539): return ", ".join(sorted(tables))
expr.py(641): parameters = state.parameters
expr.py(642): state.pop()
 --- modulename: expr, funcname: pop
expr.py(262): setattr(self, *self._stack.pop(-1))
expr.py(643): state.parameters[parameters_pos:parameters_pos] = parameters
expr.py(644): state.pop()
 --- modulename: expr, funcname: pop
expr.py(262): setattr(self, *self._stack.pop(-1))
expr.py(645): state.pop()
 --- modulename: expr, funcname: pop
expr.py(262): setattr(self, *self._stack.pop(-1))
expr.py(646): return "".join(tokens)
database.py(201): params = state.parameters
database.py(202): statement = convert_param_marks(statement, "?", self.param_mark)
 --- modulename: database, funcname: convert_param_marks
database.py(380): if from_param_mark == to_param_mark or from_param_mark not in statement:
database.py(382): tokens = statement.split("'")
database.py(383): for i in range(0, len(tokens), 2):
database.py(384): tokens[i] = tokens[i].replace(from_param_mark, to_param_mark)
database.py(383): for i in range(0, len(tokens), 2):
database.py(385): return "'".join(tokens)
database.py(203): raw_cursor = self.raw_execute(statement, params)
 --- modulename: postgres, funcname: raw_execute
postgres.py(290): if type(statement) is unicode:
postgres.py(292): statement = statement.encod...
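
For reference, this is roughly what python -m trace -t does, expressed programmatically; the script name below is a placeholder, not the real appserver entry point:

import trace

tracer = trace.Trace(trace=1, count=0)    # -t: print each line as it executes
tracer.run('execfile("appserver.py")')    # placeholder for the real startup script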

Marc Tardif (cr3) wrote:

To follow up on the above trace, the last line was:

  database.py(325): return function(*args, **kwargs)

So, by placing a print statement above that line to display the args, the following line is output:

  ('UPDATE hardware SET secure_id=E%s, confidential=%s, releases=ARRAY[%s], category=E%s, certification=E%s, soft_lock_time=%s, aliases=%s, canonical_id=E%s, foo_time=%s, make=%s, related_hardware_id=%s, model=%s, website=E%s, "timestamp"=%s, access_context_id=%s, account_id=%s, record_type_id=E%s WHERE hardware.id = %s', (<psycopg2._psycopg.Binary object at 0xbcfdb78>, False, <psycopg2._psycopg.Binary object at 0xbcfdcf8>, <psycopg2._psycopg.Binary object at 0xbc1fa28>, <psycopg2._psycopg.Binary object at 0xbc1fa40>, None, '', <psycopg2._psycopg.Binary object at 0xbd63b90>, '2009-01-29 16:24:19.138842', 'Dell', 313, 'PowerEdge 2900', <psycopg2._psycopg.Binary object at 0xbd63080>, '2009-01-28 17:44:06', 33, 15, <psycopg2._psycopg.Binary object at 0xbd63f98>, 123))

This is confirmed by the PostgreSQL database which also shows a few instances of that very same query in the pg_stat_activity table:

  certify_trunk_main# select current_query from pg_stat_activity where current_query like 'UPDATE%' group by current_query;
  UPDATE hardware SET secure_id=E'a0020000003K7XV', confidential=false, releases=ARRAY['8.04 LTS'], category=E'Server', certification=E'Compatibility', soft_lock_time=NULL, aliases='', canonical_id=E'200712-203', foo_time='2009-01-29 16:24:19.138842', make='Dell', related_hardware_id=313, model='PowerEdge 2900', website=E'', "timestamp"='2009-01-28 17:44:06', access_context_id=33, account_id=15, record_type_id=E'01220000000D1pE' WHERE hardware.id = 123

There are actually 9 instances of that query in the table, which corresponds to the number of processes shown as "UPDATE waiting" in the ps output.

Furthermore, in the pg_locks table there are also 9 rows matching the process ids of those queries where the lock has not yet been granted. The trace, the processes and the database are all in agreement.
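
A small helper along the same lines, counting the UPDATEs that PostgreSQL itself reports as waiting so the number can be cross-checked against ps; the connection string is a placeholder and the column names (waiting, current_query) are those of PostgreSQL 8.x:

import psycopg2

def count_waiting_updates(dsn):
    conn = psycopg2.connect(dsn)
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT count(*) FROM pg_stat_activity "
                       "WHERE waiting AND current_query LIKE 'UPDATE%'")
        return cursor.fetchone()[0]
    finally:
        conn.close()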

Marc Tardif (cr3) wrote:

In the pg_locks table, of the 9 rows matching the blocked process ids, only one is in ShareLock mode whereas all the others are in ExclusiveLock mode. To focus on that process, the following lines are output in the PostgreSQL log for the corresponding transaction. The interesting thing to notice is that the blocking query seems to be the first one in the list:

UPDATE hardware SET secure_id=E'a0020000003K7XV', confidential=false, releases=ARRAY['8.04 LTS'], category=E'Server', certification=E'Compatibility', soft_lock_time=NULL, aliases='', canonical_id=E'200712-203', foo_time='2009-01-29 16:24:19.138842', make='Dell', related_hardware_id=313, model='PowerEdge 2900', website=E'', "timestamp"='2009-01-28 17:44:06', access_context_id=33, account_id=15, record_type_id=E'01220000000D1pE' WHERE hardware.id = 123
SELECT 1 FROM record_type WHERE record_type.secure_id = E'01220000000D1pE'
SELECT record_type.description, record_type.name, record_type.sobject_type FROM record_type WHERE record_type.secure_id = E'01220000000D1pE'
SELECT 1 FROM hardware WHERE hardware.id = 123
SELECT hardware.access_context_id, hardware.account_id, hardware.aliases, hardware.architecture, hardware.canonical_id, hardware.category, hardware.certification, hardware.confidential, hardware.foo_time, hardware.make, hardware.model, hardware.record_type_id, hardware.related_hardware_id, hardware.releases, hardware.secure_id, hardware.soft_lock_time, hardware."timestamp", hardware.website FROM hardware WHERE hardware.id = 123
UPDATE hardware SET foo_time='2009-01-29 16:24:22.664845' WHERE hardware.id = 123
UPDATE hardware SET foo_time='2009-01-29 16:24:23.162857' WHERE hardware.id = 123
SELECT 1 FROM account WHERE account.id = 15
SELECT account.access_context_id, account.billing_city, account.billing_country, account.billing_postal_code, account.billing_state, account.billing_street, account.fax, account.name, account.phone, account.record_type_id, account.secure_id, account."timestamp", account.type, account.website FROM account WHERE account.id = 15
SELECT 1 FROM hardware WHERE hardware.id = 313
SELECT DISTINCT hardware.access_context_id, hardware.account_id, hardware.aliases, hardware.canonical_id, hardware.category, hardware.certification, hardware.confidential, hardware.foo_time, hardware.id, hardware.make, hardware.model, hardware.record_type_id, hardware.related_hardware_id, hardware.releases, hardware.secure_id, hardware.soft_lock_time, hardware."timestamp", hardware.website FROM person JOIN person_account AS pa1 ON person.id = pa1.person_id JOIN person_account AS pa2 ON pa1.account_id = pa2.account_id JOIN hardware ON hardware.account_id = pa2.account_id WHERE hardware.id = 313
UPDATE hardware SET secure_id=E'a0020000008Lhxq', confidential=false, releases='{}', category=E'Server', certification=E'Compatibility', soft_lock_time=NULL, aliases='', canonical_id=E'200810-1008', foo_time=NULL, make='HP', related_hardware_id=NULL, model='ProLiant Bak', website=NULL, "timestamp"='2009-01-23 21:01:31', access_context_id=21, account_id=1, record_type_id=E'01220000000DIpN' WHERE hardware.id = 313
SELECT DISTINCT hardware.access_context_id, hardware.account_id, hardware.alias...

Marc Tardif (cr3) wrote:

For completeness, anyone trying to reproduce this bug is advised to make the following changes to their postgresql.conf file:

log_error_verbosity = verbose
log_line_prefix = '%t %x '
log_lock_waits = on

Marc Tardif (cr3) wrote:

There is apparently a Zope3 component called DeadlockDebugger which claims to add a hook so that a deadlocked process can be debugged. The component is available from the Zope svn repository:

  svn co svn://svn.zope.org/repos/main/z3c.deadlockdebugger/trunk

After enabling the module and starting the appserver, it does indeed seem to live up to its claim. However, once the appserver is locked, the page no longer responds. This makes sense because the thread pool is probably exhausted, which prevents another thread from being acquired from the pool to serve the debugging request.

Marc Tardif (cr3) wrote:

Another approach attempted was to log stack frames to a file from a separate thread. For example, the following code writes the frames of all threads every 10 seconds. However, when Zope3 blocks, this thread also seems to stop.

def runforever(name):
    # threadframe is a third-party module exposing the frames of all threads.
    import threadframe, traceback, time, sys
    file = open(name, "w")
    while True:
        file.write(">>>>>%s, number of threads = %d\n" % (time.strftime("%H:%M:%S"), len(threadframe.dict())))
        time.sleep(10)
        for id, frame in threadframe.dict().iteritems():
            file.write("===== [%d] %s" % (id, sys.getrefcount(frame)))
            traceback.print_stack(frame, file=file)
            file.write("\n")
            file.flush()

import thread
thread.start_new_thread(runforever, ("/tmp/thread1.log",))
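
The same idea can be expressed with only the standard library (Python 2.5+), avoiding the third-party threadframe module; this is a rough rewrite for reference, not code that was actually deployed:

import sys, thread, time, traceback

def runforever_stdlib(name):
    log = open(name, "w")
    while True:
        frames = sys._current_frames()
        log.write(">>>>> %s, number of threads = %d\n"
                  % (time.strftime("%H:%M:%S"), len(frames)))
        for ident, frame in frames.items():
            log.write("===== [%d]\n" % ident)
            traceback.print_stack(frame, file=log)
        log.flush()
        time.sleep(10)

thread.start_new_thread(runforever_stdlib, ("/tmp/thread2.log",))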

Marc Tardif (cr3) wrote:

For completeness, the following strace output confirms once again that the threads are blocked on a call to lock.acquire():

# strace -fp `cat certify_web.pid`
Process 18255 attached with 14 threads - interrupt to quit
[pid 18223] futex(0x81c2740, 0x80 /* FUTEX_??? */, 0 <unfinished ...>
[pid 18224] futex(0x81c2740, 0x80 /* FUTEX_??? */, 0 <unfinished ...>
[pid 18225] futex(0x81c2740, 0x80 /* FUTEX_??? */, 0 <unfinished ...>
[pid 18226] futex(0x81c2740, 0x80 /* FUTEX_??? */, 0 <unfinished ...>
[pid 18246] restart_syscall(<... resuming interrupted call ...> <unfinished ...>
[pid 18247] restart_syscall(<... resuming interrupted call ...> <unfinished ...>
[pid 18248] restart_syscall(<... resuming interrupted call ...> <unfinished ...>
[pid 18249] restart_syscall(<... resuming interrupted call ...> <unfinished ...>
[pid 18250] restart_syscall(<... resuming interrupted call ...> <unfinished ...>
[pid 18251] futex(0x81c2740, 0x80 /* FUTEX_??? */, 0 <unfinished ...>
[pid 18252] restart_syscall(<... resuming interrupted call ...> <unfinished ...>
[pid 18253] futex(0xb76fdf8, 0x80 /* FUTEX_??? */, 2 <unfinished ...>
[pid 18254] restart_syscall(<... resuming interrupted call ...> <unfinished ...>
[pid 18255] restart_syscall(<... resuming interrupted call ...>

Marc Tardif (cr3) wrote:

To follow up on the gdb session, I attempted to get a stack trace from within gdb by calling the C function PyRun_SimpleString. At first, this resulted in a STOP signal being sent to the process. Then, by setting the unwindonsignal option, it was finally possible to obtain a stack trace:

(gdb) set unwindonsignal on
(gdb) call PyRun_SimpleString("import sys, traceback; sys.stderr=open('/tmp/trace.log','w',0); traceback.print_stack()")
$1 = 0

In the output, there is unfortunately not much more information than has already been gathered. It basically shows that Python is blocked on the UPDATE query made to the PostgreSQL database:

  ...
  File "/srv/trunk.certification.canonical.com/certify-storm/storm/database.py", line 284, in raw_execute
    self._check_disconnect(raw_cursor.execute, *args)
  File "/srv/trunk.certification.canonical.com/certify-storm/storm/database.py", line 325, in _check_disconnect
    return function(*args, **kwargs)
  File "<string>", line 1, in <module>

Marc Tardif (cr3) wrote:

After looking more closely at the threads using gdb, it seems that most of them are not actually blocked waiting to acquire a lock; they are blocked inside a call to psycopg2. One potential solution was to upgrade to the latest version of the library but, after building and installing it, the same problem occurred. Unfortunately, this doesn't leave much to go on, considering that the Storm code only really uses psycopg2 to obtain a raw connection.

(gdb) thread 2
(gdb) where
#0 0xb7f1c410 in __kernel_vsyscall ()
#1 0xb7e43c07 in poll () from /lib/tls/i686/cmov/libc.so.6
#2 0xb5d5910a in ?? () from /usr/lib/libpq.so.5
#3 0xb5d59246 in ?? () from /usr/lib/libpq.so.5
#4 0xb5d592c3 in ?? () from /usr/lib/libpq.so.5
#5 0xb5d58972 in PQgetResult () from /usr/lib/libpq.so.5
#6 0xb5d58a8c in ?? () from /usr/lib/libpq.so.5
#7 0xb5d7f9b2 in ?? () from /usr/lib/python2.5/site-packages/psycopg2/_psycopg.so
#8 0xb5d85cdc in ?? () from /usr/lib/python2.5/site-packages/psycopg2/_psycopg.so
#9 0xb5d86463 in ?? () from /usr/lib/python2.5/site-packages/psycopg2/_psycopg.so
#10 0x0805cb97 in PyObject_Call (func=0x1, arg=0xba3c8ac, kw=0xba2968c) at ../Objects/abstract.c:1861
#11 0x080c7e04 in PyEval_EvalFrameEx (f=0xa0d500c, throwflag=0) at ../Python/ceval.c:3853
#12 0x080cb1f7 in PyEval_EvalCodeEx (co=0x8df2020, globals=0x8df03e4, locals=0x0, args=0xbc2e2b8, argcount=4, kws=0x0, kwcount=0, defs=0x0, defcount=0, closure=0x0)
    at ../Python/ceval.c:2836
...

Marc Tardif (cr3) wrote:

Another potential solution was to use the PostgresTimeoutTracer to prevent runaway requests, which might also apply to this blocking situation. After enabling a timeout of 60 seconds and reproducing the problem, there were five blocked processes:

  postgres 21427 0.0 0.6 42280 5064 ? Ss 19:59 0:00 postgres: cr3 certify_trunk_main [local] UPDATE waiting

After waiting for the specified timeout, only one process remained. However, that particular process was causing the appserver to stop responding. Note that this solution is just a workaround, because a request blocked on an UPDATE is not necessarily the same as a runaway request: the former is merely waiting, whereas the latter is assumed to be doing work for much longer than expected.
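
Not the actual PostgresTimeoutTracer, but a rough sketch of a Storm timeout tracer along those lines; the class name, the fixed 60-second budget and the exact hook methods are assumptions based on storm.tracer of that era:

from storm.tracer import TimeoutTracer, install_tracer

class FixedStatementTimeoutTracer(TimeoutTracer):

    def __init__(self, seconds=60):
        super(FixedStatementTimeoutTracer, self).__init__()
        self.seconds = seconds

    def get_remaining_time(self):
        # Give every statement the same fixed budget; a per-request tracer
        # would count down from the start of the request instead.
        return self.seconds

    def set_statement_timeout(self, raw_cursor, remaining_time):
        # PostgreSQL's statement_timeout is expressed in milliseconds.
        raw_cursor.execute("SET statement_timeout TO %d" % (remaining_time * 1000,))

install_tracer(FixedStatementTimeoutTracer(60))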

James Henstridge (jamesh) wrote:

To debug this, you could try issuing the following query when things are deadlocked:

    select datname, usename, xact_start, waiting, current_query from pg_stat_activity;

This should tell you what is going on from PostgreSQL's point of view. The deadlocked connections should show up with "waiting" set to true. If you have any "<IDLE> in transaction" connections, they are possible causes of the deadlock.

Marc Tardif (cr3) wrote:

After reproducing the problem once again and running that query on the pg_stat_activity table, I see the same three updates, which are the only queries in the waiting state:

UPDATE hardware SET record_type_id=E'01220000000D1pE', confidential=false, aliases='', releases=ARRAY['8.04 LTS'], account_id=15, access_context_id=33, certification=E'Compatibility', category=E'Server', related_hardware_id=313, website=E'', "timestamp"='2009-01-29 18:28:36', foo_time='2009-01-29 23:38:25.007643', soft_lock_time=NULL, make='Dell', secure_id=E'a0020000003K7XV', model='PowerEdge 2900', canonical_id=E'200712-203' WHERE hardware.id = 123

This corresponds to the same three processes shown as waiting in the ps output:

  postgres 6568 0.0 0.6 42280 5008 ? Ss 23:38 0:00 postgres: cr3 certify_trunk_main [local] UPDATE waiting

Strangely though, when I attach to the appserver process using gdb and iterate over each thread, I seem to count twice as many threads blocked on psycopg2:

#0 0xb7f96410 in __kernel_vsyscall ()
#1 0xb7f7a589 in __lll_lock_wait () from /lib/tls/i686/cmov/libpthread.so.0
#2 0xb7f75ba6 in _L_lock_95 () from /lib/tls/i686/cmov/libpthread.so.0
#3 0xb7f7558a in pthread_mutex_lock () from /lib/tls/i686/cmov/libpthread.so.0
#4 0xb7852871 in ?? () from /usr/lib/python2.5/site-packages/psycopg2/_psycopg.so
#5 0xb7853154 in ?? () from /usr/lib/python2.5/site-packages/psycopg2/_psycopg.so
#6 0x0805cb97 in PyObject_Call (func=0x80, arg=0xbad6f8c, kw=0x89999bc) at ../Objects/abstract.c:1861
#7 0x080c7e04 in PyEval_EvalFrameEx (f=0xbbb36e4, throwflag=0) at ../Python/ceval.c:3853
...

Marc Tardif (cr3) wrote:

Upon closer inspection, it seems likely that the problem is in the Connection._check_disconnect method of the storm.database module. It handles database disconnections but lets other exceptions, such as ProgrammingError, propagate:

    def _check_disconnect(self, function, *args, **kwargs):
        try:
            return function(*args, **kwargs)
        except DatabaseError, exc:
            if self.is_disconnection_error(exc):
                self._state = STATE_DISCONNECTED
                self._raw_connection = None
                raise DisconnectionError(str(exc))
            else:
                raise

It seems that exception might be percolating up to a point where one of the Zope3 threads doesn't commit its transaction. That would explain why an UPDATE process might end up waiting on an <IDLE> in transaction process: it is effectively a deadlock until the latter process commits its transaction.

Now, whether the ProgrammingError exception should be handled in this method or higher up the stack remains to be determined.
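
Regardless of where it is handled, the blocking mechanism itself is easy to demonstrate with two raw psycopg2 connections; this sketch (placeholder connection string, and it deliberately never commits the first transaction) produces exactly the <IDLE> in transaction / UPDATE waiting pair described above:

import psycopg2

DSN = "dbname=certify_trunk_main"   # placeholder

conn_a = psycopg2.connect(DSN)
conn_b = psycopg2.connect(DSN)

cursor_a = conn_a.cursor()
cursor_a.execute("UPDATE hardware SET foo_time = now() WHERE id = 123")
# conn_a neither commits nor rolls back: pg_stat_activity now shows it as
# "<IDLE> in transaction" while it holds the row lock.

cursor_b = conn_b.cursor()
# This call blocks inside libpq (poll/PQgetResult), just like the threads
# seen in gdb, until conn_a commits or rolls back.
cursor_b.execute("UPDATE hardware SET foo_time = now() WHERE id = 123")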

Marc Tardif (cr3) wrote:

Here is a database session which might prove interesting for future debugging purposes. This first determines the query which seems to be waiting and then finds the corresponding query on which it's waiting:

# select transactionid, pid from pg_locks where pid in (select procpid from pg_stat_activity where current_query like 'UPDATE%') and mode = 'ShareLock' and granted is false;
 transactionid | pid
---------------+-------
         55215 | 14866
(1 row)

This might be worth explaining a bit. First, the subselect looks for UPDATE queries which appear to be waiting, as determined by the output of the ps command. Second, ShareLock is the mode in which transactions wait on each other, hence the word "share". Last, the lock must not have been granted, which is what we're looking for in the first place.

Given the above transactionid, the pg_locks table can be queried again to determine which two processes are involved with that same transaction:

# select pid from pg_locks where transactionid = 55215;
  pid
-------
 14880
 14866
(2 rows)

So, the other process can now be queried in the pg_stat_activity table in order to determine what its current query might be:

# select current_query from pg_stat_activity where procpid = 14880;
     current_query
-----------------------
 <IDLE> in transaction
(1 row)

The UPDATE request therefore seems to be waiting on an idle transaction, which is an application-side problem: the transaction is not being cleaned up properly.
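
The three queries above can be strung together into a small helper for future incidents; a rough sketch using PostgreSQL 8.x catalog columns (procpid, current_query) and a placeholder connection string:

import psycopg2

def find_blockers(dsn):
    conn = psycopg2.connect(dsn)
    try:
        cur = conn.cursor()
        # Step 1: transactions our waiting UPDATEs are queued behind.
        cur.execute("SELECT transactionid, pid FROM pg_locks "
                    "WHERE mode = 'ShareLock' AND NOT granted "
                    "AND pid IN (SELECT procpid FROM pg_stat_activity "
                    "            WHERE current_query LIKE 'UPDATE%')")
        for xid, waiter in cur.fetchall():
            # Step 2: every backend attached to that transaction id.
            cur.execute("SELECT pid FROM pg_locks WHERE transactionid = %s", (xid,))
            for (pid,) in cur.fetchall():
                if pid == waiter:
                    continue
                # Step 3: what the blocking backend is currently doing.
                cur.execute("SELECT current_query FROM pg_stat_activity "
                            "WHERE procpid = %s", (pid,))
                row = cur.fetchone()
                print waiter, "is waiting on", pid, ":", row and row[0]
    finally:
        conn.close()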

Marc Tardif (cr3) wrote:

For completeness, here is the stack trace of the concurrent update exception as shown in the Zope 3 logs:

2009-01-30T17:29:31 ERROR SiteError http://localhost:8080/hardware/200712-203/index.html
Traceback (most recent call last):
  File "/srv/trunk.certification.canonical.com/certify-zope3/src/zope/publisher/publish.py", line 136, in publish
    result = publication.callObject(request, obj)
  File "/srv/trunk.certification.canonical.com/certify-zope3/src/zope/app/publication/zopepublication.py", line 167, in callObject
    return mapply(ob, request.getPositionalArguments(), request)
  File "/srv/trunk.certification.canonical.com/certify-zope3/src/zope/publisher/publish.py", line 108, in mapply
    return debug_call(obj, args)
  File "/srv/trunk.certification.canonical.com/certify-zope3/src/zope/publisher/publish.py", line 114, in debug_call
    return obj(*args)
  File "/srv/trunk.certification.canonical.com/certify-web/src/canonical/certify/browser/page.py", line 61, in __call__
    result = provider.render()
  File "/srv/trunk.certification.canonical.com/certify-web/src/canonical/content/provider.py", line 55, in render
    return self.template(*args, **kwargs)
  File "/srv/trunk.certification.canonical.com/certify-zope3/src/zope/app/pagetemplate/viewpagetemplatefile.py", line 83, in __call__
    return self.im_func(im_self, *args, **kw)
  File "/srv/trunk.certification.canonical.com/certify-zope3/src/zope/app/pagetemplate/viewpagetemplatefile.py", line 51, in __call__
    sourceAnnotations=getattr(debug_flags, 'sourceAnnotations', 0),
  File "/srv/trunk.certification.canonical.com/certify-zope3/src/zope/pagetemplate/pagetemplate.py", line 115, in pt_render
    strictinsert=0, sourceAnnotations=sourceAnnotations)()
  File "/srv/trunk.certification.canonical.com/certify-zope3/src/zope/tal/talinterpreter.py", line 271, in __call__
    self.interpret(self.program)
  File "/srv/trunk.certification.canonical.com/certify-zope3/src/zope/tal/talinterpreter.py", line 346, in interpret
    handlers[opcode](self, args)
  File "/srv/trunk.certification.canonical.com/certify-zope3/src/zope/tal/talinterpreter.py", line 536, in do_optTag_tal
    self.do_optTag(stuff)
  File "/srv/trunk.certification.canonical.com/certify-zope3/src/zope/tal/talinterpreter.py", line 521, in do_optTag
    return self.no_tag(start, program)
  File "/srv/trunk.certification.canonical.com/certify-zope3/src/zope/tal/talinterpreter.py", line 516, in no_tag
    self.interpret(program)
  File "/srv/trunk.certification.canonical.com/certify-zope3/src/zope/tal/talinterpreter.py", line 346, in interpret
    handlers[opcode](self, args)
  File "/srv/trunk.certification.canonical.com/certify-zope3/src/zope/tal/talinterpreter.py", line 623, in do_insertText_tal
    text = self.engine.evaluateText(stuff[0])
  File "/srv/trunk.certification.canonical.com/certify-zope3/src/zope/tales/tales.py", line 704, in evaluateText
    text = self.evaluate(expr)
  File "/srv/trunk.certification.canonical.com/certify-zope3/src/zope/tales/tales.py", line 696, in evaluate
    return expression(self)
  File "/srv/trunk.certification.canonical.com/certify-zope3/src/zope/tales/expressions.py", line...

Marc Tardif (cr3) wrote:

After looking more closely at the Zope 3 transaction modules and the Storm integration, it was interesting to note that a unique data store object is cached on a per-thread basis. So, for a given thread, the two-phase commit in the Zope 3 transaction machinery iterates over each data store instantiated within that thread and commits them. Since this process is quite robust by design, the problem was assumed to be elsewhere.

Then, after looking at the symptoms again, it seemed that the problem could be caused by a data store being instantiated outside of the Zope 3 transaction process, or by a data store being removed from the Zope 3 transaction process but cached elsewhere. In other words, if Zope 3 doesn't know about a particular data store, it won't include it in the two-phase commit. Therefore, a transaction might never be committed, which results in an idle transaction.
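
To make the caching point concrete, here is an illustrative sketch (not the actual ZStorm code) of the safe side of that difference: a per-thread cache keeps each worker's store visible to that worker's transaction, whereas an attribute on a shared object leaks one thread's store to every other thread:

import threading

class PerThreadStoreCache(object):
    """Cache one store per thread, which is effectively how ZStorm behaves."""

    def __init__(self, create_store):
        self._local = threading.local()
        self._create_store = create_store

    def get(self):
        store = getattr(self._local, "store", None)
        if store is None:
            store = self._local.store = self._create_store()
        return store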

That being said, closer attention was given to the concept of Zope 3 utilities, which are shared across all threads. Then, the following code was found in the canonical.certification.model.store module:

class LazyStore(object):
    def __get__(self, obj, cls=None):
        if obj is not None:
            store_selector = getUtility(IStoreSelector)
            obj.store = store_selector.get(obj.get_store_name())
            return obj.store
        return self

class StoreHolder(object):
    store = LazyStore()

So, if a utility had the misfortune of inheriting from the StoreHolder base class, it would have the same data store cached across all threads, which would result in the behavior detailed above. The solution is therefore to make sure that utilities don't inherit from this class.
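
A rough sketch of the safer pattern for such a utility: never cache the store on the shared instance, and instead ask the StoreSelector on every call so each thread gets the store that its own transaction machinery knows about (the utility class and method names are illustrative):

from zope.component import getUtility
from canonical.certify.interfaces import IStoreSelector, MAIN_STORE

class SomeUtility(object):
    """A Zope 3 utility shared across all worker threads (illustrative)."""

    def _get_store(self):
        # Resolved per call, so each thread receives the store registered
        # with its own transaction rather than one cached by another thread.
        return getUtility(IStoreSelector).get(MAIN_STORE)

    def hardware_exists(self, hardware_id):
        store = self._get_store()
        result = store.execute("SELECT 1 FROM hardware WHERE hardware.id = ?",
                               (hardware_id,))
        return result.get_one() is not None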

Changed in certify-web:
status: Confirmed → In Progress
Stuart Bishop (stub) wrote:

There must be other transactions in process if you have three queries in waiting - something else is holding the lock they are waiting on. What is the output of 'select datname, usename, xact_start, waiting, current_query from pg_stat_activity;' ? The blocked transactions are not the interesting ones, the other ones holding the locks open are.

This isn't actually a database deadlock, as PostgreSQL detects deadlocks and aborts one of the transactions (see the deadlock_timeout setting in postgresql.conf). It will be another active connection, or an <IDLE in transaction> connection. If it is the latter, you need to track down why that connection has not been committed or rolled back.

Marc Tardif (cr3) wrote:

The problem was that the idle transaction was not being committed because the two-phase commit in Zope 3 was not aware of a data store, so it could not commit that particular store. The reason was that a Zope 3 utility in the certification code was caching the data store on the object even though that object was shared across threads. This has been fixed by no longer caching the data store in the StoreHolder and letting the StoreSelector delegate caching to the ZStorm module.

Changed in certify-web:
status: In Progress → Fix Released