Overte C++ Documentation
DeadlockWatchdog.h
1 //
2 // DeadlockWatchdog.h
3 // interface/src
4 //
5 // Split from Application.cpp by HifiExperiments on 3/30/24
6 // Created by Andrzej Kapolka on 5/10/13.
7 // Copyright 2013 High Fidelity, Inc.
8 // Copyright 2020 Vircadia contributors.
9 // Copyright 2022-2023 Overte e.V.
10 //
11 // Distributed under the Apache License, Version 2.0.
12 // See the accompanying file LICENSE or http://www.apache.org/licenses/LICENSE-2.0.html
13 // SPDX-License-Identifier: Apache-2.0
14 //
15 
16 #ifndef hifi_DeadlockWatchdog_h
17 #define hifi_DeadlockWatchdog_h
18 
19 #include <QThread>
20 
21 #include <NumericalConstants.h>
22 #include <SharedUtil.h>
23 #include <crash-handler/CrashHandler.h>
24 
25 #include "InterfaceLogging.h"
26 #include "SimpleMovingAverage.h"
27 
28 class DeadlockWatchdogThread : public QThread {
29 public:
30  static const unsigned long HEARTBEAT_UPDATE_INTERVAL_SECS = 1;
31  static const unsigned long MAX_HEARTBEAT_AGE_USECS = 120 * USECS_PER_SECOND; // 2 mins with no checkin probably a deadlock
32  static const int WARNING_ELAPSED_HEARTBEAT = 500 * USECS_PER_MSEC; // warn if elapsed heartbeat average is large
33  static const int HEARTBEAT_SAMPLES = 100000; // ~5 seconds worth of samples
34 
35  // Set the heartbeat on launch
36  DeadlockWatchdogThread() {
37  setObjectName("Deadlock Watchdog");
38  // Give the heartbeat an initial value
39  _heartbeat = usecTimestampNow();
40  _paused = false;
41  connect(qApp, &QCoreApplication::aboutToQuit, [this] {
42  _quit = true;
43  });
44  }
45 
46  void setMainThreadID(Qt::HANDLE threadID) {
47  _mainThreadID = threadID;
48  }
49 
50  static void updateHeartbeat() {
51  auto now = usecTimestampNow();
52  auto elapsed = now - _heartbeat;
53  _movingAverage.addSample(elapsed);
54  _heartbeat = now;
55  }
56 
57  void deadlockDetectionCrash() {
58  auto &ch = CrashHandler::getInstance();
59 
60  ch.setAnnotation("_mod_faulting_tid", std::to_string((uint64_t)_mainThreadID));
61  ch.setAnnotation("deadlock", "1");
62  uint32_t* crashTrigger = nullptr;
63  *crashTrigger = 0xDEAD10CC;
64  }
65 
66  static void withPause(const std::function<void()>& lambda) {
67  pause();
68  lambda();
69  resume();
70  }
71  static void pause() {
72  _paused = true;
73  }
74 
75  static void resume() {
76  // Update the heartbeat BEFORE resuming the checks
77  updateHeartbeat();
78  _paused = false;
79  }
80 
81  void run() override {
82  while (!_quit) {
83  QThread::sleep(HEARTBEAT_UPDATE_INTERVAL_SECS);
84  // Don't do heartbeat detection under nsight
85  if (_paused) {
86  continue;
87  }
88  uint64_t lastHeartbeat = _heartbeat; // sample atomic _heartbeat, because we could context switch away and have it updated on us
89  uint64_t now = usecTimestampNow();
90  auto lastHeartbeatAge = (now > lastHeartbeat) ? now - lastHeartbeat : 0;
91  auto elapsedMovingAverage = _movingAverage.getAverage();
92 
93  if (elapsedMovingAverage > _maxElapsedAverage * 1.1f) {
94 #if !defined(NDEBUG)
95  qCDebug(interfaceapp_deadlock) << "DEADLOCK WATCHDOG WARNING:"
96  << "lastHeartbeatAge:" << lastHeartbeatAge
97  << "elapsedMovingAverage:" << elapsedMovingAverage
98  << "maxElapsed:" << _maxElapsed
99  << "PREVIOUS maxElapsedAverage:" << _maxElapsedAverage
100  << "NEW maxElapsedAverage:" << elapsedMovingAverage << "** NEW MAX ELAPSED AVERAGE **"
101  << "samples:" << _movingAverage.getSamples();
102 #endif
103  _maxElapsedAverage = elapsedMovingAverage;
104  }
105  if (lastHeartbeatAge > _maxElapsed) {
106 #if !defined(NDEBUG)
107  qCDebug(interfaceapp_deadlock) << "DEADLOCK WATCHDOG WARNING:"
108  << "lastHeartbeatAge:" << lastHeartbeatAge
109  << "elapsedMovingAverage:" << elapsedMovingAverage
110  << "PREVIOUS maxElapsed:" << _maxElapsed
111  << "NEW maxElapsed:" << lastHeartbeatAge << "** NEW MAX ELAPSED **"
112  << "maxElapsedAverage:" << _maxElapsedAverage
113  << "samples:" << _movingAverage.getSamples();
114 #endif
115  _maxElapsed = lastHeartbeatAge;
116  }
117 
118 #if !defined(NDEBUG)
119  if (elapsedMovingAverage > WARNING_ELAPSED_HEARTBEAT) {
120  qCDebug(interfaceapp_deadlock) << "DEADLOCK WATCHDOG WARNING:"
121  << "lastHeartbeatAge:" << lastHeartbeatAge
122  << "elapsedMovingAverage:" << elapsedMovingAverage << "** OVER EXPECTED VALUE **"
123  << "maxElapsed:" << _maxElapsed
124  << "maxElapsedAverage:" << _maxElapsedAverage
125  << "samples:" << _movingAverage.getSamples();
126  }
127 #endif
128 
129  if (lastHeartbeatAge > MAX_HEARTBEAT_AGE_USECS) {
130  qCDebug(interfaceapp_deadlock) << "DEADLOCK DETECTED -- "
131  << "lastHeartbeatAge:" << lastHeartbeatAge
132  << "[ lastHeartbeat :" << lastHeartbeat
133  << "now:" << now << " ]"
134  << "elapsedMovingAverage:" << elapsedMovingAverage
135  << "maxElapsed:" << _maxElapsed
136  << "maxElapsedAverage:" << _maxElapsedAverage
137  << "samples:" << _movingAverage.getSamples();
138 
139  // Don't actually crash in debug builds, in case this apparent deadlock is simply from
140  // the developer actively debugging code
141  #ifdef NDEBUG
142  deadlockDetectionCrash();
143  #endif
144  }
145  }
146  }
147 
148  static std::atomic<bool> _paused;
149  static std::atomic<uint64_t> _heartbeat;
150  static std::atomic<uint64_t> _maxElapsed;
151  static std::atomic<int> _maxElapsedAverage;
152  static ThreadSafeMovingAverage<int, HEARTBEAT_SAMPLES> _movingAverage;
153 
154  bool _quit { false };
155 
156  Qt::HANDLE _mainThreadID = nullptr;
157 };
158 
159 #endif // hifi_DeadlockWatchdog_h