1 /**
2 * Copyright (c) 2020, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #define LOG_TAG "carwatchdogd"
18 #define DEBUG false // STOPSHIP if true.
19
20 #include "WatchdogProcessService.h"
21
22 #include "WatchdogServiceHelper.h"
23
24 #include <android-base/file.h>
25 #include <android-base/macros.h>
26 #include <android-base/properties.h>
27 #include <android-base/stringprintf.h>
28 #include <android-base/strings.h>
29 #include <android/automotive/watchdog/BnCarWatchdogClient.h>
30 #include <android/automotive/watchdog/internal/BnCarWatchdogMonitor.h>
31 #include <android/automotive/watchdog/internal/BnCarWatchdogServiceForSystem.h>
32 #include <android/hardware/automotive/vehicle/2.0/types.h>
33 #include <android/hidl/manager/1.0/IServiceManager.h>
34 #include <binder/IPCThreadState.h>
35 #include <hidl/HidlTransportSupport.h>
36 #include <utils/SystemClock.h>
37
38 #include <utility>
39
40 namespace android {
41 namespace automotive {
42 namespace watchdog {
43
44 namespace aawi = ::android::automotive::watchdog::internal;
45
46 using aawi::BnCarWatchdogServiceForSystem;
47 using aawi::ICarWatchdogServiceForSystem;
48 using ::android::IBinder;
49 using ::android::sp;
50 using ::android::String16;
51 using ::android::base::Error;
52 using ::android::base::GetIntProperty;
53 using ::android::base::GetProperty;
54 using ::android::base::ReadFileToString;
55 using ::android::base::Result;
56 using ::android::base::StringAppendF;
57 using ::android::base::StringPrintf;
58 using ::android::base::Trim;
59 using ::android::base::WriteStringToFd;
60 using ::android::binder::Status;
61 using ::android::hardware::hidl_vec;
62 using ::android::hardware::interfacesEqual;
63 using ::android::hardware::Return;
64 using ::android::hardware::automotive::vehicle::V2_0::IVehicle;
65 using ::android::hardware::automotive::vehicle::V2_0::ProcessTerminationReason;
66 using ::android::hardware::automotive::vehicle::V2_0::StatusCode;
67 using ::android::hardware::automotive::vehicle::V2_0::SubscribeFlags;
68 using ::android::hardware::automotive::vehicle::V2_0::SubscribeOptions;
69 using ::android::hardware::automotive::vehicle::V2_0::VehiclePropConfig;
70 using ::android::hardware::automotive::vehicle::V2_0::VehicleProperty;
71 using ::android::hardware::automotive::vehicle::V2_0::VehiclePropertyStatus;
72 using ::android::hardware::automotive::vehicle::V2_0::VehiclePropValue;
73 using ::android::hidl::base::V1_0::IBase;
74
75 namespace {
76
77 const std::vector<TimeoutLength> kTimeouts = {TimeoutLength::TIMEOUT_CRITICAL,
78 TimeoutLength::TIMEOUT_MODERATE,
79 TimeoutLength::TIMEOUT_NORMAL};
80
81 // TimeoutLength is also used as a message ID. Other message IDs should start next to
82 // TimeoutLength::TIMEOUT_NORMAL.
83 const int32_t MSG_VHAL_WATCHDOG_ALIVE = static_cast<int>(TimeoutLength::TIMEOUT_NORMAL) + 1;
84 const int32_t MSG_VHAL_HEALTH_CHECK = MSG_VHAL_WATCHDOG_ALIVE + 1;
85
86 // VHAL is supposed to send heart beat every 3s. Car watchdog checks if there is the latest heart
87 // beat from VHAL within 3s, allowing 1s marginal time.
88 // If {@code ro.carwatchdog.vhal_healthcheck.interval} is set, car watchdog checks VHAL health at
89 // the given interval. The lower bound of the interval is 3s.
90 constexpr int32_t kDefaultVhalCheckIntervalSec = 3;
91 constexpr std::chrono::milliseconds kHealthCheckDelayMs = 1s;
92
93 constexpr const char kPropertyVhalCheckInterval[] = "ro.carwatchdog.vhal_healthcheck.interval";
94 constexpr const char kServiceName[] = "WatchdogProcessService";
95 constexpr const char kVhalInterfaceName[] = "android.hardware.automotive.vehicle@2.0::IVehicle";
96
timeoutToDurationNs(const TimeoutLength & timeout)97 std::chrono::nanoseconds timeoutToDurationNs(const TimeoutLength& timeout) {
98 switch (timeout) {
99 case TimeoutLength::TIMEOUT_CRITICAL:
100 return 3s; // 3s and no buffer time.
101 case TimeoutLength::TIMEOUT_MODERATE:
102 return 6s; // 5s + 1s as buffer time.
103 case TimeoutLength::TIMEOUT_NORMAL:
104 return 12s; // 10s + 2s as buffer time.
105 }
106 }
107
// Formats the given process IDs as a ", "-separated list. Returns an empty string when |pids| is
// empty.
std::string pidArrayToString(const std::vector<int32_t>& pids) {
    std::string buffer;
    const char* separator = "";
    for (const int32_t pid : pids) {
        buffer += separator;
        buffer += std::to_string(pid);
        // Every element after the first one is preceded by the separator.
        separator = ", ";
    }
    return buffer;
}
121
isSystemShuttingDown()122 bool isSystemShuttingDown() {
123 std::string sysPowerCtl;
124 std::istringstream tokenStream(GetProperty("sys.powerctl", ""));
125 std::getline(tokenStream, sysPowerCtl, ',');
126 return sysPowerCtl == "reboot" || sysPowerCtl == "shutdown";
127 }
128
129 } // namespace
130
WatchdogProcessService(const sp<Looper> & handlerLooper)131 WatchdogProcessService::WatchdogProcessService(const sp<Looper>& handlerLooper) :
132 mHandlerLooper(handlerLooper),
133 mIsEnabled(true),
134 mLastSessionId(0),
135 mServiceStarted(false),
136 mVhalService(nullptr) {
137 mMessageHandler = sp<MessageHandlerImpl>::make(this);
138 mBinderDeathRecipient = sp<BinderDeathRecipient>::make(this);
139 mHidlDeathRecipient = sp<HidlDeathRecipient>::make(this);
140 mPropertyChangeListener = sp<PropertyChangeListener>::make(this);
141 for (const auto& timeout : kTimeouts) {
142 mClients.insert(std::make_pair(timeout, std::vector<ClientInfo>()));
143 mPingedClients.insert(std::make_pair(timeout, PingedClientMap()));
144 }
145 int32_t vhalHealthCheckIntervalSec =
146 GetIntProperty(kPropertyVhalCheckInterval, kDefaultVhalCheckIntervalSec);
147 vhalHealthCheckIntervalSec = std::max(vhalHealthCheckIntervalSec, kDefaultVhalCheckIntervalSec);
148 mVhalHealthCheckWindowMs = std::chrono::seconds(vhalHealthCheckIntervalSec);
149 }
registerWatchdogServiceHelper(const sp<IWatchdogServiceHelper> & helper)150 Result<void> WatchdogProcessService::registerWatchdogServiceHelper(
151 const sp<IWatchdogServiceHelper>& helper) {
152 if (helper == nullptr) {
153 return Error() << "Must provide a non-null watchdog service helper instance";
154 }
155 Mutex::Autolock lock(mMutex);
156 mWatchdogServiceHelper = helper;
157 return {};
158 }
159
registerClient(const sp<ICarWatchdogClient> & client,TimeoutLength timeout)160 Status WatchdogProcessService::registerClient(const sp<ICarWatchdogClient>& client,
161 TimeoutLength timeout) {
162 pid_t callingPid = IPCThreadState::self()->getCallingPid();
163 uid_t callingUid = IPCThreadState::self()->getCallingUid();
164 ClientInfo clientInfo(client, callingPid, callingUid);
165
166 Mutex::Autolock lock(mMutex);
167 return registerClientLocked(clientInfo, timeout);
168 }
169
unregisterClient(const sp<ICarWatchdogClient> & client)170 Status WatchdogProcessService::unregisterClient(const sp<ICarWatchdogClient>& client) {
171 Mutex::Autolock lock(mMutex);
172 sp<IBinder> binder = BnCarWatchdogClient::asBinder(client);
173 // kTimeouts is declared as global static constant to cover all kinds of timeout (CRITICAL,
174 // MODERATE, NORMAL).
175 return unregisterClientLocked(kTimeouts, binder, ClientType::Regular);
176 }
177
registerCarWatchdogService(const sp<IBinder> & binder)178 Status WatchdogProcessService::registerCarWatchdogService(const sp<IBinder>& binder) {
179 pid_t callingPid = IPCThreadState::self()->getCallingPid();
180 uid_t callingUid = IPCThreadState::self()->getCallingUid();
181
182 Mutex::Autolock lock(mMutex);
183 if (mWatchdogServiceHelper == nullptr) {
184 return Status::fromExceptionCode(Status::EX_ILLEGAL_STATE,
185 "Watchdog service helper instance is null");
186 }
187 ClientInfo clientInfo(mWatchdogServiceHelper, binder, callingPid, callingUid);
188 return registerClientLocked(clientInfo, TimeoutLength::TIMEOUT_CRITICAL);
189 }
190
unregisterCarWatchdogService(const sp<IBinder> & binder)191 void WatchdogProcessService::unregisterCarWatchdogService(const sp<IBinder>& binder) {
192 Mutex::Autolock lock(mMutex);
193
194 std::vector<TimeoutLength> timeouts = {TimeoutLength::TIMEOUT_CRITICAL};
195 unregisterClientLocked(timeouts, binder, ClientType::Service);
196 }
197
registerMonitor(const sp<aawi::ICarWatchdogMonitor> & monitor)198 Status WatchdogProcessService::registerMonitor(const sp<aawi::ICarWatchdogMonitor>& monitor) {
199 Mutex::Autolock lock(mMutex);
200 sp<IBinder> binder = aawi::BnCarWatchdogMonitor::asBinder(monitor);
201 if (mMonitor != nullptr && binder == aawi::BnCarWatchdogMonitor::asBinder(mMonitor)) {
202 return Status::ok();
203 }
204 status_t ret = binder->linkToDeath(mBinderDeathRecipient);
205 if (ret != OK) {
206 ALOGW("Failed to register the monitor as it is dead.");
207 return Status::fromExceptionCode(Status::EX_ILLEGAL_STATE, "The monitor is dead.");
208 }
209 mMonitor = monitor;
210 if (DEBUG) {
211 ALOGD("Car watchdog monitor is registered");
212 }
213 return Status::ok();
214 }
215
unregisterMonitor(const sp<aawi::ICarWatchdogMonitor> & monitor)216 Status WatchdogProcessService::unregisterMonitor(const sp<aawi::ICarWatchdogMonitor>& monitor) {
217 Mutex::Autolock lock(mMutex);
218 sp<IBinder> curBinder = aawi::BnCarWatchdogMonitor::asBinder(mMonitor);
219 sp<IBinder> newBinder = aawi::BnCarWatchdogMonitor::asBinder(monitor);
220 if (curBinder != newBinder) {
221 ALOGW("Failed to unregister the monitor as it has not been registered.");
222 return Status::fromExceptionCode(Status::EX_ILLEGAL_ARGUMENT,
223 "The monitor has not been registered.");
224 }
225 curBinder->unlinkToDeath(mBinderDeathRecipient);
226 mMonitor = nullptr;
227 if (DEBUG) {
228 ALOGD("Car watchdog monitor is unregistered");
229 }
230 return Status::ok();
231 }
232
tellClientAlive(const sp<ICarWatchdogClient> & client,int32_t sessionId)233 Status WatchdogProcessService::tellClientAlive(const sp<ICarWatchdogClient>& client,
234 int32_t sessionId) {
235 Mutex::Autolock lock(mMutex);
236 return tellClientAliveLocked(BnCarWatchdogClient::asBinder(client), sessionId);
237 }
238
tellCarWatchdogServiceAlive(const sp<ICarWatchdogServiceForSystem> & service,const std::vector<int32_t> & clientsNotResponding,int32_t sessionId)239 Status WatchdogProcessService::tellCarWatchdogServiceAlive(
240 const sp<ICarWatchdogServiceForSystem>& service,
241 const std::vector<int32_t>& clientsNotResponding, int32_t sessionId) {
242 Status status;
243 {
244 Mutex::Autolock lock(mMutex);
245 if (DEBUG) {
246 std::string buffer;
247 int size = clientsNotResponding.size();
248 if (size != 0) {
249 StringAppendF(&buffer, "%d", clientsNotResponding[0]);
250 for (int i = 1; i < clientsNotResponding.size(); i++) {
251 StringAppendF(&buffer, ", %d", clientsNotResponding[i]);
252 }
253 ALOGD("CarWatchdogService(session: %d) responded with non-responding clients: %s",
254 sessionId, buffer.c_str());
255 }
256 }
257 status = tellClientAliveLocked(BnCarWatchdogServiceForSystem::asBinder(service), sessionId);
258 }
259 if (status.isOk()) {
260 dumpAndKillAllProcesses(clientsNotResponding, true);
261 }
262 return status;
263 }
264
tellDumpFinished(const sp<aawi::ICarWatchdogMonitor> & monitor,int32_t pid)265 Status WatchdogProcessService::tellDumpFinished(const sp<aawi::ICarWatchdogMonitor>& monitor,
266 int32_t pid) {
267 Mutex::Autolock lock(mMutex);
268 if (mMonitor == nullptr || monitor == nullptr ||
269 aawi::BnCarWatchdogMonitor::asBinder(monitor) !=
270 aawi::BnCarWatchdogMonitor::asBinder(mMonitor)) {
271 return Status::
272 fromExceptionCode(Status::EX_ILLEGAL_ARGUMENT,
273 "The monitor is not registered or an invalid monitor is given");
274 }
275 ALOGI("Process(pid: %d) has been dumped and killed", pid);
276 return Status::ok();
277 }
278
setEnabled(bool isEnabled)279 void WatchdogProcessService::setEnabled(bool isEnabled) {
280 Mutex::Autolock lock(mMutex);
281 if (mIsEnabled != isEnabled) {
282 ALOGI("%s is %s", kServiceName, isEnabled ? "enabled" : "disabled");
283 }
284 mIsEnabled = isEnabled;
285 if (mIsEnabled) {
286 for (const auto& timeout : kTimeouts) {
287 startHealthCheckingLocked(timeout);
288 }
289 }
290 }
291
notifyUserStateChange(userid_t userId,bool isStarted)292 void WatchdogProcessService::notifyUserStateChange(userid_t userId, bool isStarted) {
293 std::string buffer;
294 Mutex::Autolock lock(mMutex);
295 if (isStarted) {
296 mStoppedUserIds.erase(userId);
297 } else {
298 mStoppedUserIds.insert(userId);
299 }
300 }
301
dump(int fd,const Vector<String16> &)302 Result<void> WatchdogProcessService::dump(int fd, const Vector<String16>& /*args*/) {
303 Mutex::Autolock lock(mMutex);
304 const char* indent = " ";
305 const char* doubleIndent = " ";
306 std::string buffer;
307 WriteStringToFd("CAR WATCHDOG PROCESS SERVICE\n", fd);
308 WriteStringToFd(StringPrintf("%s%s enabled: %s\n", indent, kServiceName,
309 mIsEnabled ? "true" : "false"),
310 fd);
311 WriteStringToFd(StringPrintf("%sRegistered clients\n", indent), fd);
312 int count = 1;
313 for (const auto& timeout : kTimeouts) {
314 std::vector<ClientInfo>& clients = mClients[timeout];
315 for (auto it = clients.begin(); it != clients.end(); it++, count++) {
316 WriteStringToFd(StringPrintf("%sClient #%d: %s\n", doubleIndent, count,
317 it->toString().c_str()),
318 fd);
319 }
320 }
321 WriteStringToFd(StringPrintf("%sMonitor registered: %s\n", indent,
322 mMonitor == nullptr ? "false" : "true"),
323 fd);
324 WriteStringToFd(StringPrintf("%sisSystemShuttingDown: %s\n", indent,
325 isSystemShuttingDown() ? "true" : "false"),
326 fd);
327 buffer = "none";
328 bool first = true;
329 for (const auto& userId : mStoppedUserIds) {
330 if (first) {
331 buffer = StringPrintf("%d", userId);
332 first = false;
333 } else {
334 StringAppendF(&buffer, ", %d", userId);
335 }
336 }
337 WriteStringToFd(StringPrintf("%sStopped users: %s\n", indent, buffer.c_str()), fd);
338 WriteStringToFd(StringPrintf("%sVHAL health check interval: %lldms\n", indent,
339 mVhalHealthCheckWindowMs.count()),
340 fd);
341 return {};
342 }
343
// Runs one health check round for the timeout class encoded in the message ID |what|:
//  1. requests dump-and-kill for clients that did not answer the previous ping,
//  2. assigns a new session to every registered client whose user is not stopped and pings it,
//  3. schedules the next round while there are registered clients.
// Nothing happens when the service is disabled.
void WatchdogProcessService::doHealthCheck(int what) {
    mHandlerLooper->removeMessages(mMessageHandler, what);
    if (Mutex::Autolock lock(mMutex); !mIsEnabled) {
        return;
    }
    const TimeoutLength timeout = static_cast<TimeoutLength>(what);
    dumpAndKillClientsIfNotResponding(timeout);

    /* Generates a temporary/local vector containing the clients to ping.
     * Using a local copy may send unnecessary ping messages to clients after they are unregistered.
     * Clients should be able to handle them.
     */
    std::vector<ClientInfo> clientsToPing;
    bool hasRegisteredClients = false;
    {
        Mutex::Autolock lock(mMutex);
        PingedClientMap& pingedClients = mPingedClients[timeout];
        pingedClients.clear();
        const std::vector<ClientInfo>& registeredClients = mClients[timeout];
        hasRegisteredClients = !registeredClients.empty();
        clientsToPing.reserve(registeredClients.size());
        for (const auto& registeredClient : registeredClients) {
            // Clients of stopped users get no session, so they are not pinged either.
            if (mStoppedUserIds.count(registeredClient.userId) > 0) {
                continue;
            }
            ClientInfo clientInfo = registeredClient;
            clientInfo.sessionId = getNewSessionId();
            pingedClients.insert(std::make_pair(clientInfo.sessionId, clientInfo));
            clientsToPing.push_back(std::move(clientInfo));
        }
    }

    // Pinging is done without holding mMutex.
    for (const auto& clientInfo : clientsToPing) {
        Status status = clientInfo.checkIfAlive(timeout);
        if (!status.isOk()) {
            ALOGW("Sending a ping message to client(pid: %d) failed: %s", clientInfo.pid,
                  status.exceptionMessage().c_str());
            // A client that could not be pinged must not be treated as unresponsive later.
            Mutex::Autolock lock(mMutex);
            mPingedClients[timeout].erase(clientInfo.sessionId);
        }
    }
    // Though the number of pinged clients is a more specific measure, the number of registered
    // clients is used as a conservative approach.
    if (hasRegisteredClients) {
        auto durationNs = timeoutToDurationNs(timeout);
        mHandlerLooper->sendMessageDelayed(durationNs.count(), mMessageHandler, Message(what));
    }
}
390
start()391 Result<void> WatchdogProcessService::start() {
392 if (mServiceStarted) {
393 return Error(INVALID_OPERATION) << "Cannot start process monitoring more than once";
394 }
395 mServiceStarted = true;
396 reportWatchdogAliveToVhal();
397 return {};
398 }
399
// Stops the service: drops all registered clients, the watchdog service helper, the monitor and
// the VHAL connection, and discards the messages still queued for this service.
void WatchdogProcessService::terminate() {
    Mutex::Autolock lock(mMutex);
    for (const auto& timeout : kTimeouts) {
        std::vector<ClientInfo>& clients = mClients[timeout];
        for (const auto& client : clients) {
            client.unlinkToDeath(mBinderDeathRecipient);
        }
        clients.clear();
    }
    mWatchdogServiceHelper.clear();
    if (mMonitor != nullptr) {
        sp<IBinder> binder = aawi::BnCarWatchdogMonitor::asBinder(mMonitor);
        binder->unlinkToDeath(mBinderDeathRecipient);
        // Its death is no longer observed, so the monitor must not be used anymore.
        mMonitor = nullptr;
    }
    if (mVhalService != nullptr) {
        mVhalService->unlinkToDeath(mHidlDeathRecipient);
        // Forces a fresh connection (with a new death link) if the service is started again.
        mVhalService = nullptr;
    }
    // Pending health check and VHAL messages must not fire after termination.
    mHandlerLooper->removeMessages(mMessageHandler);
    mServiceStarted = false;
}
419
registerClientLocked(const ClientInfo & clientInfo,TimeoutLength timeout)420 Status WatchdogProcessService::registerClientLocked(const ClientInfo& clientInfo,
421 TimeoutLength timeout) {
422 if (findClientAndProcessLocked(kTimeouts, clientInfo, nullptr)) {
423 ALOGW("Failed to register (%s) as it is already registered.",
424 clientInfo.toString().c_str());
425 return Status::ok();
426 }
427 status_t status = clientInfo.linkToDeath(mBinderDeathRecipient);
428 if (status != OK) {
429 ALOGW("Failed to register (%s) as it is dead", clientInfo.toString().c_str());
430 std::string errorStr = StringPrintf("(%s) is dead", clientInfo.toString().c_str());
431 return Status::fromExceptionCode(Status::EX_ILLEGAL_STATE, errorStr.c_str());
432 }
433 std::vector<ClientInfo>& clients = mClients[timeout];
434 clients.emplace_back(clientInfo);
435
436 // If the client array becomes non-empty, start health checking.
437 if (clients.size() == 1) {
438 startHealthCheckingLocked(timeout);
439 }
440 if (DEBUG) {
441 ALOGD("Car watchdog client (%s, timeout = %d) is registered", clientInfo.toString().c_str(),
442 timeout);
443 }
444 return Status::ok();
445 }
446
unregisterClientLocked(const std::vector<TimeoutLength> & timeouts,sp<IBinder> binder,ClientType clientType)447 Status WatchdogProcessService::unregisterClientLocked(const std::vector<TimeoutLength>& timeouts,
448 sp<IBinder> binder, ClientType clientType) {
449 const char* clientName = clientType == ClientType::Regular ? "client" : "watchdog service";
450 bool result = findClientAndProcessLocked(timeouts, binder,
451 [&](std::vector<ClientInfo>& clients,
452 std::vector<ClientInfo>::const_iterator it) {
453 it->unlinkToDeath(mBinderDeathRecipient);
454 clients.erase(it);
455 });
456 if (!result) {
457 std::string errorStr = StringPrintf("The %s has not been registered", clientName);
458 const char* errorCause = errorStr.c_str();
459 ALOGW("Failed to unregister the %s: %s", clientName, errorCause);
460 return Status::fromExceptionCode(Status::EX_ILLEGAL_ARGUMENT, errorCause);
461 }
462 if (DEBUG) {
463 ALOGD("Car watchdog %s is unregistered", clientName);
464 }
465 return Status::ok();
466 }
467
tellClientAliveLocked(const sp<IBinder> & binder,int32_t sessionId)468 Status WatchdogProcessService::tellClientAliveLocked(const sp<IBinder>& binder, int32_t sessionId) {
469 for (const auto& timeout : kTimeouts) {
470 PingedClientMap& clients = mPingedClients[timeout];
471 PingedClientMap::const_iterator it = clients.find(sessionId);
472 if (it == clients.cend() || !it->second.matchesBinder(binder)) {
473 continue;
474 }
475 clients.erase(it);
476 return Status::ok();
477 }
478 return Status::fromExceptionCode(Status::EX_ILLEGAL_ARGUMENT,
479 "The client is not registered or the session ID is not found");
480 }
481
findClientAndProcessLocked(const std::vector<TimeoutLength> timeouts,const ClientInfo & clientInfo,const Processor & processor)482 bool WatchdogProcessService::findClientAndProcessLocked(const std::vector<TimeoutLength> timeouts,
483 const ClientInfo& clientInfo,
484 const Processor& processor) {
485 for (const auto& timeout : timeouts) {
486 std::vector<ClientInfo>& clients = mClients[timeout];
487 for (auto it = clients.begin(); it != clients.end(); it++) {
488 if (std::as_const(*it) != clientInfo) {
489 continue;
490 }
491 if (processor != nullptr) {
492 processor(clients, it);
493 }
494 return true;
495 }
496 }
497 return false;
498 }
499
findClientAndProcessLocked(const std::vector<TimeoutLength> timeouts,const sp<IBinder> binder,const Processor & processor)500 bool WatchdogProcessService::findClientAndProcessLocked(const std::vector<TimeoutLength> timeouts,
501 const sp<IBinder> binder,
502 const Processor& processor) {
503 for (const auto& timeout : timeouts) {
504 std::vector<ClientInfo>& clients = mClients[timeout];
505 for (auto it = clients.begin(); it != clients.end(); it++) {
506 if (!it->matchesBinder(binder)) {
507 continue;
508 }
509 if (processor != nullptr) {
510 processor(clients, it);
511 }
512 return true;
513 }
514 }
515 return false;
516 }
517
startHealthCheckingLocked(TimeoutLength timeout)518 Result<void> WatchdogProcessService::startHealthCheckingLocked(TimeoutLength timeout) {
519 PingedClientMap& clients = mPingedClients[timeout];
520 clients.clear();
521 int what = static_cast<int>(timeout);
522 auto durationNs = timeoutToDurationNs(timeout);
523 mHandlerLooper->sendMessageDelayed(durationNs.count(), mMessageHandler, Message(what));
524 return {};
525 }
526
dumpAndKillClientsIfNotResponding(TimeoutLength timeout)527 Result<void> WatchdogProcessService::dumpAndKillClientsIfNotResponding(TimeoutLength timeout) {
528 std::vector<int32_t> processIds;
529 std::vector<const ClientInfo*> clientsToNotify;
530 {
531 Mutex::Autolock lock(mMutex);
532 PingedClientMap& clients = mPingedClients[timeout];
533 for (PingedClientMap::const_iterator it = clients.cbegin(); it != clients.cend(); it++) {
534 pid_t pid = -1;
535 userid_t userId = -1;
536 std::vector<TimeoutLength> timeouts = {timeout};
537 findClientAndProcessLocked(timeouts, it->second,
538 [&](std::vector<ClientInfo>& cachedClients,
539 std::vector<ClientInfo>::const_iterator
540 cachedClientsIt) {
541 pid = cachedClientsIt->pid;
542 userId = cachedClientsIt->userId;
543 cachedClients.erase(cachedClientsIt);
544 });
545 if (pid != -1 && mStoppedUserIds.count(userId) == 0) {
546 clientsToNotify.emplace_back(&it->second);
547 processIds.push_back(pid);
548 }
549 }
550 }
551 for (const ClientInfo*& clientInfo : clientsToNotify) {
552 clientInfo->prepareProcessTermination();
553 }
554 return dumpAndKillAllProcesses(processIds, true);
555 }
556
dumpAndKillAllProcesses(const std::vector<int32_t> & processesNotResponding,bool reportToVhal)557 Result<void> WatchdogProcessService::dumpAndKillAllProcesses(
558 const std::vector<int32_t>& processesNotResponding, bool reportToVhal) {
559 size_t size = processesNotResponding.size();
560 if (size == 0) {
561 return {};
562 }
563 std::string pidString = pidArrayToString(processesNotResponding);
564 sp<aawi::ICarWatchdogMonitor> monitor;
565 {
566 Mutex::Autolock lock(mMutex);
567 if (mMonitor == nullptr) {
568 std::string errorMsg =
569 StringPrintf("Failed to dump and kill processes(pid = %s): Monitor is not set",
570 pidString.c_str());
571 ALOGW("%s", errorMsg.c_str());
572 return Error() << errorMsg;
573 }
574 monitor = mMonitor;
575 }
576 if (isSystemShuttingDown()) {
577 ALOGI("Skip dumping and killing processes(%s): The system is shutting down",
578 pidString.c_str());
579 return {};
580 }
581 if (reportToVhal) {
582 reportTerminatedProcessToVhal(processesNotResponding);
583 }
584 monitor->onClientsNotResponding(processesNotResponding);
585 if (DEBUG) {
586 ALOGD("Dumping and killing processes is requested: %s", pidString.c_str());
587 }
588 return {};
589 }
590
591 // Handle when car watchdog clients die.
handleBinderDeath(const wp<IBinder> & who)592 void WatchdogProcessService::handleBinderDeath(const wp<IBinder>& who) {
593 Mutex::Autolock lock(mMutex);
594 IBinder* binder = who.unsafe_get();
595 // Check if dead binder is monitor.
596 sp<IBinder> monitor = aawi::BnCarWatchdogMonitor::asBinder(mMonitor);
597 if (monitor == binder) {
598 mMonitor = nullptr;
599 ALOGW("The monitor has died.");
600 return;
601 }
602 findClientAndProcessLocked(kTimeouts, binder,
603 [&](std::vector<ClientInfo>& clients,
604 std::vector<ClientInfo>::const_iterator it) {
605 ALOGW("Client(pid: %d) died", it->pid);
606 clients.erase(it);
607 });
608 }
609
610 // Handle when VHAL dies.
handleHidlDeath(const wp<IBase> & who)611 void WatchdogProcessService::handleHidlDeath(const wp<IBase>& who) {
612 Mutex::Autolock lock(mMutex);
613 if (!interfacesEqual(mVhalService, who.promote())) {
614 return;
615 }
616 ALOGW("VHAL has died.");
617 mVhalService->unlinkToDeath(mHidlDeathRecipient);
618 mVhalService = nullptr;
619 }
620
reportWatchdogAliveToVhal()621 void WatchdogProcessService::reportWatchdogAliveToVhal() {
622 if (mNotSupportedVhalProperties.count(VehicleProperty::WATCHDOG_ALIVE) > 0) {
623 ALOGW("VHAL doesn't support WATCHDOG_ALIVE. Car watchdog will not update WATCHDOG_ALIVE.");
624 return;
625 }
626 int64_t systemUptime = uptimeMillis();
627 VehiclePropValue propValue{
628 .prop = static_cast<int32_t>(VehicleProperty::WATCHDOG_ALIVE),
629 .status = VehiclePropertyStatus::AVAILABLE,
630 .value = {.int64Values = {systemUptime}},
631 };
632 const auto& ret = updateVhal(propValue);
633 if (!ret.ok()) {
634 ALOGW("Failed to update WATCHDOG_ALIVE VHAL property. Will try again in 3s");
635 }
636 // Update VHAL with the interval of TIMEOUT_CRITICAL(3s).
637 auto durationNs = timeoutToDurationNs(TimeoutLength::TIMEOUT_CRITICAL);
638 mHandlerLooper->removeMessages(mMessageHandler, MSG_VHAL_WATCHDOG_ALIVE);
639 mHandlerLooper->sendMessageDelayed(durationNs.count(), mMessageHandler,
640 Message(MSG_VHAL_WATCHDOG_ALIVE));
641 }
642
reportTerminatedProcessToVhal(const std::vector<int32_t> & processesNotResponding)643 void WatchdogProcessService::reportTerminatedProcessToVhal(
644 const std::vector<int32_t>& processesNotResponding) {
645 if (mNotSupportedVhalProperties.count(VehicleProperty::WATCHDOG_TERMINATED_PROCESS) > 0) {
646 ALOGW("VHAL doesn't support WATCHDOG_TERMINATED_PROCESS. Terminated process is not "
647 "reported to VHAL.");
648 return;
649 }
650 for (auto&& pid : processesNotResponding) {
651 const auto& retCmdLine = readProcCmdLine(pid);
652 if (!retCmdLine.ok()) {
653 ALOGW("Failed to get process command line for pid(%d): %s", pid,
654 retCmdLine.error().message().c_str());
655 continue;
656 }
657 std::string procCmdLine = retCmdLine.value();
658 VehiclePropValue propValue{
659 .prop = static_cast<int32_t>(VehicleProperty::WATCHDOG_TERMINATED_PROCESS),
660 .status = VehiclePropertyStatus::AVAILABLE,
661 .value = {
662 .int32Values = {static_cast<int32_t>(
663 ProcessTerminationReason::NOT_RESPONDING)},
664 .stringValue = procCmdLine,
665 },
666 };
667 const auto& retUpdate = updateVhal(propValue);
668 if (!retUpdate.ok()) {
669 ALOGW("Failed to update WATCHDOG_TERMINATED_PROCESS VHAL property(command line: %s)",
670 procCmdLine.c_str());
671 }
672 }
673 }
674
updateVhal(const VehiclePropValue & value)675 Result<void> WatchdogProcessService::updateVhal(const VehiclePropValue& value) {
676 Mutex::Autolock lock(mMutex);
677 const auto& connectRet = connectToVhalLocked();
678 if (!connectRet.ok()) {
679 std::string errorMsg = "VHAL is not connected: " + connectRet.error().message();
680 ALOGW("%s", errorMsg.c_str());
681 return Error() << errorMsg;
682 }
683 if (mNotSupportedVhalProperties.count(static_cast<VehicleProperty>(value.prop)) > 0) {
684 std::string errorMsg = StringPrintf("VHAL doesn't support property(id: %d)", value.prop);
685 ALOGW("%s", errorMsg.c_str());
686 return Error() << errorMsg;
687 }
688 const auto& updateRet = mVhalService->set(value);
689 if (updateRet.isOk() && updateRet == StatusCode::OK) {
690 return {};
691 }
692 return Error() << "Failed to set propValue(" << value.prop << ") to VHAL";
693 }
694
readProcCmdLine(int32_t pid)695 Result<std::string> WatchdogProcessService::readProcCmdLine(int32_t pid) {
696 std::string cmdLinePath = StringPrintf("/proc/%d/cmdline", pid);
697 std::string procCmdLine;
698 if (ReadFileToString(cmdLinePath, &procCmdLine)) {
699 std::replace(procCmdLine.begin(), procCmdLine.end(), '\0', ' ');
700 procCmdLine = Trim(procCmdLine);
701 return procCmdLine;
702 }
703 return Error() << "Failed to read " << cmdLinePath;
704 }
705
connectToVhalLocked()706 Result<void> WatchdogProcessService::connectToVhalLocked() {
707 if (mVhalService.get() != nullptr) {
708 return {};
709 }
710 mVhalService = IVehicle::tryGetService();
711 if (mVhalService.get() == nullptr) {
712 return Error() << "Failed to connect to VHAL.";
713 }
714 mVhalService->linkToDeath(mHidlDeathRecipient, /*cookie=*/0);
715 queryVhalPropertiesLocked();
716 subscribeToVhalHeartBeatLocked();
717 ALOGI("Successfully connected to VHAL.");
718 return {};
719 }
720
queryVhalPropertiesLocked()721 void WatchdogProcessService::queryVhalPropertiesLocked() {
722 mNotSupportedVhalProperties.clear();
723 std::vector<VehicleProperty> propIds = {VehicleProperty::WATCHDOG_ALIVE,
724 VehicleProperty::WATCHDOG_TERMINATED_PROCESS,
725 VehicleProperty::VHAL_HEARTBEAT};
726 for (const auto& propId : propIds) {
727 if (!isVhalPropertySupportedLocked(propId)) {
728 mNotSupportedVhalProperties.insert(propId);
729 }
730 }
731 }
732
isVhalPropertySupportedLocked(VehicleProperty propId)733 bool WatchdogProcessService::isVhalPropertySupportedLocked(VehicleProperty propId) {
734 StatusCode status;
735 hidl_vec<int32_t> props = {static_cast<int32_t>(propId)};
736 mVhalService->getPropConfigs(props,
737 [&status](StatusCode s,
738 hidl_vec<VehiclePropConfig> /*propConfigs*/) {
739 status = s;
740 });
741 return status == StatusCode::OK;
742 }
743
subscribeToVhalHeartBeatLocked()744 void WatchdogProcessService::subscribeToVhalHeartBeatLocked() {
745 if (mNotSupportedVhalProperties.count(VehicleProperty::VHAL_HEARTBEAT) > 0) {
746 ALOGW("VHAL doesn't support VHAL_HEARTBEAT. Checking VHAL health is disabled.");
747 return;
748 }
749
750 mVhalHeartBeat = {
751 .eventTime = 0,
752 .value = 0,
753 };
754
755 SubscribeOptions reqVhalProperties[] = {
756 {.propId = static_cast<int32_t>(VehicleProperty::VHAL_HEARTBEAT),
757 .flags = SubscribeFlags::EVENTS_FROM_CAR},
758 };
759 hidl_vec<SubscribeOptions> options;
760 options.setToExternal(reqVhalProperties, arraysize(reqVhalProperties));
761 StatusCode status = mVhalService->subscribe(mPropertyChangeListener, options);
762 if (status != StatusCode::OK) {
763 ALOGW("Failed to subscribe to VHAL_HEARTBEAT. Checking VHAL health is disabled.");
764 return;
765 }
766 std::chrono::nanoseconds intervalNs = mVhalHealthCheckWindowMs + kHealthCheckDelayMs;
767 mHandlerLooper->sendMessageDelayed(intervalNs.count(), mMessageHandler,
768 Message(MSG_VHAL_HEALTH_CHECK));
769 }
770
getNewSessionId()771 int32_t WatchdogProcessService::getNewSessionId() {
772 // Make sure that session id is always positive number.
773 if (++mLastSessionId <= 0) {
774 mLastSessionId = 1;
775 }
776 return mLastSessionId;
777 }
778
updateVhalHeartBeat(int64_t value)779 void WatchdogProcessService::updateVhalHeartBeat(int64_t value) {
780 bool wrongHeartBeat;
781 {
782 Mutex::Autolock lock(mMutex);
783 wrongHeartBeat = value <= mVhalHeartBeat.value;
784 mVhalHeartBeat.eventTime = uptimeMillis();
785 mVhalHeartBeat.value = value;
786 }
787 if (wrongHeartBeat) {
788 ALOGW("VHAL updated heart beat with a wrong value. Terminating VHAL...");
789 terminateVhal();
790 return;
791 }
792 std::chrono::nanoseconds intervalNs = mVhalHealthCheckWindowMs + kHealthCheckDelayMs;
793 mHandlerLooper->sendMessageDelayed(intervalNs.count(), mMessageHandler,
794 Message(MSG_VHAL_HEALTH_CHECK));
795 }
796
checkVhalHealth()797 void WatchdogProcessService::checkVhalHealth() {
798 int64_t lastEventTime;
799 int64_t currentUptime = uptimeMillis();
800 {
801 Mutex::Autolock lock(mMutex);
802 lastEventTime = mVhalHeartBeat.eventTime;
803 }
804 if (currentUptime > lastEventTime + mVhalHealthCheckWindowMs.count()) {
805 ALOGW("VHAL failed to update heart beat within timeout. Terminating VHAL...");
806 terminateVhal();
807 }
808 }
809
terminateVhal()810 void WatchdogProcessService::terminateVhal() {
811 using ::android::hidl::manager::V1_0::IServiceManager;
812
813 std::vector<int32_t> processIds;
814 sp<IServiceManager> manager = IServiceManager::getService();
815 Return<void> ret = manager->debugDump([&](auto& hals) {
816 for (const auto& info : hals) {
817 if (info.pid == static_cast<int>(IServiceManager::PidConstant::NO_PID)) {
818 continue;
819 }
820 if (info.interfaceName == kVhalInterfaceName) {
821 processIds.push_back(info.pid);
822 break;
823 }
824 }
825 });
826
827 if (!ret.isOk()) {
828 ALOGE("Failed to terminate VHAL: could not get VHAL process id");
829 return;
830 } else if (processIds.empty()) {
831 ALOGE("Failed to terminate VHAL: VHAL is not running");
832 return;
833 }
834 dumpAndKillAllProcesses(processIds, false);
835 }
836
toString() const837 std::string WatchdogProcessService::ClientInfo::toString() const {
838 std::string buffer;
839 StringAppendF(&buffer, "pid = %d, userId = %d, type = %s", pid, userId,
840 type == ClientType::Regular ? "regular" : "watchdog service");
841 return buffer;
842 }
843
getBinder() const844 sp<IBinder> WatchdogProcessService::ClientInfo::getBinder() const {
845 if (type == ClientType::Regular) {
846 return BnCarWatchdogClient::asBinder(client);
847 }
848 return watchdogServiceBinder;
849 }
850
linkToDeath(const sp<IBinder::DeathRecipient> & recipient) const851 status_t WatchdogProcessService::ClientInfo::linkToDeath(
852 const sp<IBinder::DeathRecipient>& recipient) const {
853 if (type == ClientType::Regular) {
854 return BnCarWatchdogClient::asBinder(client)->linkToDeath(recipient);
855 }
856 // WatchdogServiceHelper is the binder death recipient for watchdog service, ergo
857 // skip this step.
858 return OK;
859 }
860
unlinkToDeath(const wp<IBinder::DeathRecipient> & recipient) const861 status_t WatchdogProcessService::ClientInfo::unlinkToDeath(
862 const wp<IBinder::DeathRecipient>& recipient) const {
863 if (type == ClientType::Regular) {
864 return BnCarWatchdogClient::asBinder(client)->unlinkToDeath(recipient);
865 }
866 // WatchdogServiceHelper is the binder death recipient for watchdog service, ergo
867 // skip this step.
868 return OK;
869 }
870
checkIfAlive(TimeoutLength timeout) const871 Status WatchdogProcessService::ClientInfo::checkIfAlive(TimeoutLength timeout) const {
872 if (type == ClientType::Regular) {
873 return client->checkIfAlive(sessionId, timeout);
874 }
875 return watchdogServiceHelper->checkIfAlive(watchdogServiceBinder, sessionId, timeout);
876 }
877
prepareProcessTermination() const878 Status WatchdogProcessService::ClientInfo::prepareProcessTermination() const {
879 if (type == ClientType::Regular) {
880 return client->prepareProcessTermination();
881 }
882 return watchdogServiceHelper->prepareProcessTermination(watchdogServiceBinder);
883 }
884
BinderDeathRecipient(const sp<WatchdogProcessService> & service)885 WatchdogProcessService::BinderDeathRecipient::BinderDeathRecipient(
886 const sp<WatchdogProcessService>& service) :
887 mService(service) {}
888
binderDied(const wp<IBinder> & who)889 void WatchdogProcessService::BinderDeathRecipient::binderDied(const wp<IBinder>& who) {
890 mService->handleBinderDeath(who);
891 }
892
HidlDeathRecipient(const sp<WatchdogProcessService> & service)893 WatchdogProcessService::HidlDeathRecipient::HidlDeathRecipient(
894 const sp<WatchdogProcessService>& service) :
895 mService(service) {}
896
serviceDied(uint64_t,const wp<IBase> & who)897 void WatchdogProcessService::HidlDeathRecipient::serviceDied(uint64_t /*cookie*/,
898 const wp<IBase>& who) {
899 mService->handleHidlDeath(who);
900 }
901
PropertyChangeListener(const sp<WatchdogProcessService> & service)902 WatchdogProcessService::PropertyChangeListener::PropertyChangeListener(
903 const sp<WatchdogProcessService>& service) :
904 mService(service) {}
905
onPropertyEvent(const hidl_vec<VehiclePropValue> & propValues)906 Return<void> WatchdogProcessService::PropertyChangeListener::onPropertyEvent(
907 const hidl_vec<VehiclePropValue>& propValues) {
908 for (const auto& value : propValues) {
909 if (value.prop == static_cast<int32_t>(VehicleProperty::VHAL_HEARTBEAT)) {
910 mService->updateVhalHeartBeat(value.value.int64Values[0]);
911 break;
912 }
913 }
914 return Return<void>();
915 }
916
onPropertySet(const VehiclePropValue &)917 Return<void> WatchdogProcessService::PropertyChangeListener::onPropertySet(
918 const VehiclePropValue& /*propValue*/) {
919 return Return<void>();
920 }
921
onPropertySetError(StatusCode,int32_t,int32_t)922 Return<void> WatchdogProcessService::PropertyChangeListener::onPropertySetError(
923 StatusCode /*status*/, int32_t /*propId*/, int32_t /*areaId*/) {
924 return Return<void>();
925 }
926
MessageHandlerImpl(const sp<WatchdogProcessService> & service)927 WatchdogProcessService::MessageHandlerImpl::MessageHandlerImpl(
928 const sp<WatchdogProcessService>& service) :
929 mService(service) {}
930
handleMessage(const Message & message)931 void WatchdogProcessService::MessageHandlerImpl::handleMessage(const Message& message) {
932 switch (message.what) {
933 case static_cast<int>(TimeoutLength::TIMEOUT_CRITICAL):
934 case static_cast<int>(TimeoutLength::TIMEOUT_MODERATE):
935 case static_cast<int>(TimeoutLength::TIMEOUT_NORMAL):
936 mService->doHealthCheck(message.what);
937 break;
938 case MSG_VHAL_WATCHDOG_ALIVE:
939 mService->reportWatchdogAliveToVhal();
940 break;
941 case MSG_VHAL_HEALTH_CHECK:
942 mService->checkVhalHealth();
943 break;
944 default:
945 ALOGW("Unknown message: %d", message.what);
946 }
947 }
948
949 } // namespace watchdog
950 } // namespace automotive
951 } // namespace android
952