/*
 * Copyright (C) 2008 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.server;

import android.app.IActivityController;
import android.content.BroadcastReceiver;
import android.content.Context;
import android.content.Intent;
import android.content.IntentFilter;
import android.hidl.manager.V1_0.IServiceManager;
import android.os.Binder;
import android.os.Build;
import android.os.Debug;
import android.os.FileUtils;
import android.os.Handler;
import android.os.IPowerManager;
import android.os.Looper;
import android.os.Process;
import android.os.RemoteException;
import android.os.ServiceDebugInfo;
import android.os.ServiceManager;
import android.os.SystemClock;
import android.os.SystemProperties;
import android.sysprop.WatchdogProperties;
import android.util.EventLog;
import android.util.Log;
import android.util.Slog;
import android.util.SparseArray;

import com.android.internal.os.ProcessCpuTracker;
import com.android.internal.os.ZygoteConnectionConstants;
import com.android.internal.util.FrameworkStatsLog;
import com.android.server.am.ActivityManagerService;
import com.android.server.am.TraceErrorLogger;
import com.android.server.wm.SurfaceAnimationThread;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.TimeUnit;

/**
 * Watches a set of handler threads and monitors for hangs.
 *
 * <p>A dedicated "watchdog" thread wakes up every {@link #CHECK_INTERVAL} milliseconds and posts a
 * {@link HandlerChecker} to the front of each watched handler's queue. When a checker has been
 * outstanding for half of its timeout, stack traces are dumped; when it has been outstanding for
 * the full timeout, diagnostics are collected and this process is killed (unless a debugger is or
 * was recently attached, restarts are disallowed, or the activity controller asks to keep
 * waiting).
 */
public class Watchdog {
    static final String TAG = "Watchdog";

    /** Debug flag. */
    public static final boolean DEBUG = false;

    // Set this to true to use debug default values.
    private static final boolean DB = false;

    // Note 1: Do not lower this value below thirty seconds without tightening the invoke-with
    //         timeout in com.android.internal.os.ZygoteConnection, or wrapped applications
    //         can trigger the watchdog.
    // Note 2: The debug value is already below the wait time in ZygoteConnection. Wrapped
    //         applications may not work with a debug build. CTS will fail.
    private static final long DEFAULT_TIMEOUT =
            (DB ? 10 * 1000 : 60 * 1000) * Build.HW_TIMEOUT_MULTIPLIER;
    private static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2;

    // These are temporally ordered: larger values as lateness increases
    private static final int COMPLETED = 0;
    private static final int WAITING = 1;
    private static final int WAITED_HALF = 2;
    private static final int OVERDUE = 3;

    // Track watchdog timeout history and break the crash loop if there is one.
    private static final String TIMEOUT_HISTORY_FILE = "/data/system/watchdog-timeout-history.txt";
    private static final String PROP_FATAL_LOOP_COUNT = "framework_watchdog.fatal_count";
    private static final String PROP_FATAL_LOOP_WINDOWS_SECS =
            "framework_watchdog.fatal_window.second";

    // Which native processes to dump into dropbox's stack traces
    public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
        "/system/bin/audioserver",
        "/system/bin/cameraserver",
        "/system/bin/drmserver",
        "/system/bin/keystore2",
        "/system/bin/mediadrmserver",
        "/system/bin/mediaserver",
        "/system/bin/netd",
        "/system/bin/sdcard",
        "/system/bin/surfaceflinger",
        "/system/bin/vold",
        "media.extractor", // system/bin/mediaextractor
        "media.metrics", // system/bin/mediametrics
        "media.codec", // vendor/bin/hw/android.hardware.media.omx@1.0-service
        "media.swcodec", // /apex/com.android.media.swcodec/bin/mediaswcodec
        "media.transcoding", // Media transcoding service
        "com.android.bluetooth",  // Bluetooth service
        "/apex/com.android.os.statsd/bin/statsd",  // Stats daemon
    };

    /** HIDL interfaces whose hosting processes are included in native stack dumps. */
    public static final List<String> HAL_INTERFACES_OF_INTEREST = Arrays.asList(
            "android.hardware.audio@4.0::IDevicesFactory",
            "android.hardware.audio@5.0::IDevicesFactory",
            "android.hardware.audio@6.0::IDevicesFactory",
            "android.hardware.audio@7.0::IDevicesFactory",
            "android.hardware.biometrics.face@1.0::IBiometricsFace",
            "android.hardware.biometrics.fingerprint@2.1::IBiometricsFingerprint",
            "android.hardware.bluetooth@1.0::IBluetoothHci",
            "android.hardware.camera.provider@2.4::ICameraProvider",
            "android.hardware.gnss@1.0::IGnss",
            "android.hardware.graphics.allocator@2.0::IAllocator",
            "android.hardware.graphics.composer@2.1::IComposer",
            "android.hardware.health@2.0::IHealth",
            "android.hardware.light@2.0::ILight",
            "android.hardware.media.c2@1.0::IComponentStore",
            "android.hardware.media.omx@1.0::IOmx",
            "android.hardware.media.omx@1.0::IOmxStore",
            "android.hardware.neuralnetworks@1.0::IDevice",
            "android.hardware.power.stats@1.0::IPowerStats",
            "android.hardware.sensors@1.0::ISensors",
            "android.hardware.sensors@2.0::ISensors",
            "android.hardware.sensors@2.1::ISensors",
            "android.hardware.vr@1.0::IVr",
            "android.system.suspend@1.0::ISystemSuspend"
    );

    /** Name prefixes of AIDL services whose hosting processes are included in stack dumps. */
    public static final String[] AIDL_INTERFACE_PREFIXES_OF_INTEREST = new String[] {
            "android.hardware.biometrics.face.IFace/",
            "android.hardware.biometrics.fingerprint.IFingerprint/",
            "android.hardware.light.ILights/",
            "android.hardware.power.stats.IPowerStats/",
    };

    private static Watchdog sWatchdog;

    private final Thread mThread;

    private final Object mLock = new Object();

    /* Checkers for every watched handler thread; accessed under mLock after construction. */
    private final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<>();
    /* The checker (on the foreground thread) that also runs the registered monitors. */
    private final HandlerChecker mMonitorChecker;
    private ActivityManagerService mActivity;

    private IActivityController mController;
    private boolean mAllowRestart = true;
    private final List<Integer> mInterestingJavaPids = new ArrayList<>();

    private final TraceErrorLogger mTraceErrorLogger;

    /**
     * Used for checking status of handle threads and scheduling monitor callbacks.
     *
     * <p>Methods with the {@code Locked} suffix are called by the enclosing class while holding
     * {@code mLock}.
     */
    public final class HandlerChecker implements Runnable {
        private final Handler mHandler;
        private final String mName;
        private final long mWaitMax;
        private final ArrayList<Monitor> mMonitors = new ArrayList<>();
        private final ArrayList<Monitor> mMonitorQueue = new ArrayList<>();
        private boolean mCompleted;
        private Monitor mCurrentMonitor;
        private long mStartTime;
        private int mPauseCount;

        /**
         * @param handler handler of the thread to watch
         * @param name human readable name used in logs and reports
         * @param waitMaxMillis how long a check may stay outstanding before it is overdue
         */
        HandlerChecker(Handler handler, String name, long waitMaxMillis) {
            mHandler = handler;
            mName = name;
            mWaitMax = waitMaxMillis;
            mCompleted = true;
        }

        /** Queues {@code monitor}; it becomes active on the next idle schedule. */
        void addMonitorLocked(Monitor monitor) {
            // We don't want to update mMonitors when the Handler is in the middle of checking
            // all monitors. We will update mMonitors on the next schedule if it is safe
            mMonitorQueue.add(monitor);
        }

        /** Posts this checker to the watched handler unless a check is in flight or paused. */
        public void scheduleCheckLocked() {
            if (mCompleted) {
                // Safe to update monitors in queue, Handler is not in the middle of work
                mMonitors.addAll(mMonitorQueue);
                mMonitorQueue.clear();
            }
            if ((mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling())
                    || (mPauseCount > 0)) {
                // Don't schedule until after resume OR
                // If the target looper has recently been polling, then
                // there is no reason to enqueue our checker on it since that
                // is as good as it not being deadlocked.  This avoid having
                // to do a context switch to check the thread. Note that we
                // only do this if we have no monitors since those would need to
                // be executed at this point.
                mCompleted = true;
                return;
            }
            if (!mCompleted) {
                // we already have a check in flight, so no need
                return;
            }

            mCompleted = false;
            mCurrentMonitor = null;
            mStartTime = SystemClock.uptimeMillis();
            mHandler.postAtFrontOfQueue(this);
        }

        /** Returns true if a check is in flight and has exceeded the maximum wait time. */
        boolean isOverdueLocked() {
            return (!mCompleted) && (SystemClock.uptimeMillis() > mStartTime + mWaitMax);
        }

        /**
         * Returns one of {@code COMPLETED}, {@code WAITING}, {@code WAITED_HALF} or
         * {@code OVERDUE} depending on how long the in-flight check has been outstanding.
         */
        public int getCompletionStateLocked() {
            if (mCompleted) {
                return COMPLETED;
            } else {
                long latency = SystemClock.uptimeMillis() - mStartTime;
                if (latency < mWaitMax / 2) {
                    return WAITING;
                } else if (latency < mWaitMax) {
                    return WAITED_HALF;
                }
            }
            return OVERDUE;
        }

        /** Returns the thread backing the watched handler's looper. */
        public Thread getThread() {
            return mHandler.getLooper().getThread();
        }

        /** Returns the name given to this checker at construction. */
        public String getName() {
            return mName;
        }

        /** Describes where this checker is blocked: in the handler itself or in a monitor. */
        String describeBlockedStateLocked() {
            if (mCurrentMonitor == null) {
                return "Blocked in handler on " + mName + " (" + getThread().getName() + ")";
            } else {
                return "Blocked in monitor " + mCurrentMonitor.getClass().getName()
                        + " on " + mName + " (" + getThread().getName() + ")";
            }
        }

        /** Runs on the watched thread: invokes every monitor and then marks completion. */
        @Override
        public void run() {
            // Once we get here, we ensure that mMonitors does not change even if we call
            // #addMonitorLocked because we first add the new monitors to mMonitorQueue and
            // move them to mMonitors on the next schedule when mCompleted is true, at which
            // point we have completed execution of this method.
            final int size = mMonitors.size();
            for (int i = 0; i < size; i++) {
                // Invoke through a local captured under the lock: #scheduleCheckLocked may
                // reset mCurrentMonitor to null from the watchdog thread (e.g. after a
                // pause marked this checker as completed while it was still running).
                final Monitor monitor;
                synchronized (mLock) {
                    monitor = mMonitors.get(i);
                    mCurrentMonitor = monitor;
                }
                monitor.monitor();
            }

            synchronized (mLock) {
                mCompleted = true;
                mCurrentMonitor = null;
            }
        }

        /** Pause the HandlerChecker. */
        public void pauseLocked(String reason) {
            mPauseCount++;
            // Mark as completed, because there's a chance we called this after the watchdog
            // thread loop called Object#wait after 'WAITED_HALF'. In that case we want to ensure
            // the next call to #getCompletionStateLocked for this checker returns 'COMPLETED'
            mCompleted = true;
            Slog.i(TAG, "Pausing HandlerChecker: " + mName + " for reason: "
                    + reason + ". Pause count: " + mPauseCount);
        }

        /** Resume the HandlerChecker from the last {@link #pauseLocked}. */
        public void resumeLocked(String reason) {
            if (mPauseCount > 0) {
                mPauseCount--;
                Slog.i(TAG, "Resuming HandlerChecker: " + mName + " for reason: "
                        + reason + ". Pause count: " + mPauseCount);
            } else {
                Slog.wtf(TAG, "Already resumed HandlerChecker: " + mName);
            }
        }
    }

    /** Reboots the system when an ACTION_REBOOT broadcast with a non-zero "nowait" arrives. */
    final class RebootRequestReceiver extends BroadcastReceiver {
        @Override
        public void onReceive(Context c, Intent intent) {
            if (intent.getIntExtra("nowait", 0) != 0) {
                rebootSystem("Received ACTION_REBOOT broadcast");
                return;
            }
            Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent);
        }
    }

    /** Monitor for checking the availability of binder threads. The monitor will block until
     * there is a binder thread available to process in coming IPCs to make sure other processes
     * can still communicate with the service.
     */
    private static final class BinderThreadMonitor implements Watchdog.Monitor {
        @Override
        public void monitor() {
            Binder.blockUntilThreadAvailable();
        }
    }

    /** Callback run on the monitor checker's thread; a call that never returns is a hang. */
    public interface Monitor {
        void monitor();
    }

    /**
     * Returns the singleton, creating it on first use.
     *
     * <p>NOTE(review): creation is not synchronized; presumably the first call always happens
     * on a single thread during startup - confirm against callers.
     */
    public static Watchdog getInstance() {
        if (sWatchdog == null) {
            sWatchdog = new Watchdog();
        }

        return sWatchdog;
    }

    private Watchdog() {
        mThread = new Thread(this::run, "watchdog");
        // Initialize handler checkers for each common thread we want to check.  Note
        // that we are not currently checking the background thread, since it can
        // potentially hold longer running operations with no guarantees about the timeliness
        // of operations there.

        // The shared foreground thread is the main checker.  It is where we
        // will also dispatch monitor checks and do other work.
        mMonitorChecker = new HandlerChecker(FgThread.getHandler(),
                "foreground thread", DEFAULT_TIMEOUT);
        mHandlerCheckers.add(mMonitorChecker);
        // Add checker for main thread.  We only do a quick check since there
        // can be UI running on the thread.
        mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()),
                "main thread", DEFAULT_TIMEOUT));
        // Add checker for shared UI thread.
        mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(),
                "ui thread", DEFAULT_TIMEOUT));
        // And also check IO thread.
        mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(),
                "i/o thread", DEFAULT_TIMEOUT));
        // And the display thread.
        mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(),
                "display thread", DEFAULT_TIMEOUT));
        // And the animation thread.
        mHandlerCheckers.add(new HandlerChecker(AnimationThread.getHandler(),
                "animation thread", DEFAULT_TIMEOUT));
        // And the surface animation thread.
        mHandlerCheckers.add(new HandlerChecker(SurfaceAnimationThread.getHandler(),
                "surface animation thread", DEFAULT_TIMEOUT));

        // Initialize monitor for Binder threads.
        addMonitor(new BinderThreadMonitor());

        mInterestingJavaPids.add(Process.myPid());

        // See the notes on DEFAULT_TIMEOUT.
        assert DB ||
                DEFAULT_TIMEOUT > ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS;

        mTraceErrorLogger = new TraceErrorLogger();
    }

    /**
     * Called by SystemServer to cause the internal thread to begin execution.
     */
    public void start() {
        mThread.start();
    }

    /**
     * Registers a {@link BroadcastReceiver} to listen to reboot broadcasts and trigger reboot.
     * Should be called during boot after the ActivityManagerService is up and registered
     * as a system service so it can handle registration of a {@link BroadcastReceiver}.
     */
    public void init(Context context, ActivityManagerService activity) {
        mActivity = activity;
        context.registerReceiver(new RebootRequestReceiver(),
                new IntentFilter(Intent.ACTION_REBOOT),
                android.Manifest.permission.REBOOT, null);
    }

    /** Returns true for the Java processes whose stacks are included in watchdog dumps. */
    private static boolean isInterestingJavaProcess(String processName) {
        return processName.equals(StorageManagerService.sMediaStoreAuthorityProcessName)
                || processName.equals("com.android.phone");
    }

    /**
     * Notifies the watchdog when a Java process with {@code pid} is started.
     * This process may have its stack trace dumped during an ANR.
     */
    public void processStarted(String processName, int pid) {
        if (isInterestingJavaProcess(processName)) {
            Slog.i(TAG, "Interesting Java process " + processName + " started. Pid " + pid);
            synchronized (mLock) {
                mInterestingJavaPids.add(pid);
            }
        }
    }

    /**
     * Notifies the watchdog when a Java process with {@code pid} dies.
     */
    public void processDied(String processName, int pid) {
        if (isInterestingJavaProcess(processName)) {
            Slog.i(TAG, "Interesting Java process " + processName + " died. Pid " + pid);
            synchronized (mLock) {
                // Integer.valueOf forces remove(Object) rather than remove(int index).
                mInterestingJavaPids.remove(Integer.valueOf(pid));
            }
        }
    }

    /** Sets the controller that is consulted before the system process is killed. */
    public void setActivityController(IActivityController controller) {
        synchronized (mLock) {
            mController = controller;
        }
    }

    /** Sets whether the watchdog may kill the system process when a hang is detected. */
    public void setAllowRestart(boolean allowRestart) {
        synchronized (mLock) {
            mAllowRestart = allowRestart;
        }
    }

    /** Adds a monitor that is run on the foreground thread checker. */
    public void addMonitor(Monitor monitor) {
        synchronized (mLock) {
            mMonitorChecker.addMonitorLocked(monitor);
        }
    }

    /** Starts watching the thread behind {@code thread} with the default timeout. */
    public void addThread(Handler thread) {
        addThread(thread, DEFAULT_TIMEOUT);
    }

    /** Starts watching the thread behind {@code thread} with the given timeout. */
    public void addThread(Handler thread, long timeoutMillis) {
        synchronized (mLock) {
            final String name = thread.getLooper().getThread().getName();
            mHandlerCheckers.add(new HandlerChecker(thread, name, timeoutMillis));
        }
    }

    /**
     * Pauses Watchdog action for the currently running thread. Useful before executing long running
     * operations that could falsely trigger the watchdog. Each call to this will require a matching
     * call to {@link #resumeWatchingCurrentThread}.
     *
     * <p>If the current thread has not been added to the Watchdog, this call is a no-op.
     *
     * <p>If the Watchdog is already paused for the current thread, this call adds
     * another pause and will require an additional {@link #resumeWatchingCurrentThread} to
     * resume.
     *
     * <p>Note: Use with care, as any deadlocks on the current thread will be undetected until all
     * pauses have been resumed.
     */
    public void pauseWatchingCurrentThread(String reason) {
        synchronized (mLock) {
            for (HandlerChecker hc : mHandlerCheckers) {
                if (Thread.currentThread().equals(hc.getThread())) {
                    hc.pauseLocked(reason);
                }
            }
        }
    }

    /**
     * Resumes the last pause from {@link #pauseWatchingCurrentThread} for the currently running
     * thread.
     *
     * <p>If the current thread has not been added to the Watchdog, this call is a no-op.
     *
     * <p>If the Watchdog action for the current thread is already resumed, this call logs a wtf.
     *
     * <p>If all pauses have been resumed, the Watchdog action is finally resumed, otherwise,
     * the Watchdog action for the current thread remains paused until resume is called at least
     * as many times as the calls to pause.
     */
    public void resumeWatchingCurrentThread(String reason) {
        synchronized (mLock) {
            for (HandlerChecker hc : mHandlerCheckers) {
                if (Thread.currentThread().equals(hc.getThread())) {
                    hc.resumeLocked(reason);
                }
            }
        }
    }

    /**
     * Perform a full reboot of the system.
     */
    void rebootSystem(String reason) {
        Slog.i(TAG, "Rebooting system because: " + reason);
        IPowerManager pms = (IPowerManager) ServiceManager.getService(Context.POWER_SERVICE);
        if (pms == null) {
            // Nothing to call into; report it rather than crash the caller with an NPE.
            Slog.w(TAG, "Unable to reboot: power service is not available");
            return;
        }
        try {
            pms.reboot(false, reason, false);
        } catch (RemoteException ex) {
            Slog.w(TAG, "Failed to request reboot from power service", ex);
        }
    }

    /** Returns the most severe completion state across all handler checkers. */
    private int evaluateCheckerCompletionLocked() {
        int state = COMPLETED;
        for (int i = 0; i < mHandlerCheckers.size(); i++) {
            HandlerChecker hc = mHandlerCheckers.get(i);
            state = Math.max(state, hc.getCompletionStateLocked());
        }
        return state;
    }

    /** Returns the checkers that are currently overdue. */
    private ArrayList<HandlerChecker> getBlockedCheckersLocked() {
        ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>();
        for (int i = 0; i < mHandlerCheckers.size(); i++) {
            HandlerChecker hc = mHandlerCheckers.get(i);
            if (hc.isOverdueLocked()) {
                checkers.add(hc);
            }
        }
        return checkers;
    }

    /** Builds a comma separated description of where each given checker is blocked. */
    private String describeCheckersLocked(List<HandlerChecker> checkers) {
        StringBuilder builder = new StringBuilder(128);
        for (int i = 0; i < checkers.size(); i++) {
            if (builder.length() > 0) {
                builder.append(", ");
            }
            builder.append(checkers.get(i).describeBlockedStateLocked());
        }
        return builder.toString();
    }

    /** Adds the pids hosting any of {@link #HAL_INTERFACES_OF_INTEREST} to {@code pids}. */
    private static void addInterestingHidlPids(HashSet<Integer> pids) {
        try {
            IServiceManager serviceManager = IServiceManager.getService();
            ArrayList<IServiceManager.InstanceDebugInfo> dump =
                    serviceManager.debugDump();
            for (IServiceManager.InstanceDebugInfo info : dump) {
                if (info.pid == IServiceManager.PidConstant.NO_PID) {
                    continue;
                }

                if (!HAL_INTERFACES_OF_INTEREST.contains(info.interfaceName)) {
                    continue;
                }

                pids.add(info.pid);
            }
        } catch (RemoteException e) {
            Log.w(TAG, e);
        }
    }

    /**
     * Adds the debug pids of services whose name starts with one of
     * {@link #AIDL_INTERFACE_PREFIXES_OF_INTEREST} to {@code pids}.
     */
    private static void addInterestingAidlPids(HashSet<Integer> pids) {
        ServiceDebugInfo[] infos = ServiceManager.getServiceDebugInfo();
        if (infos == null) return;

        for (ServiceDebugInfo info : infos) {
            for (String prefix : AIDL_INTERFACE_PREFIXES_OF_INTEREST) {
                if (info.name.startsWith(prefix)) {
                    pids.add(info.debugPid);
                }
            }
        }
    }

    /**
     * Returns the de-duplicated pids of the interesting AIDL services, HIDL services and
     * native commands.
     */
    static ArrayList<Integer> getInterestingNativePids() {
        HashSet<Integer> pids = new HashSet<>();
        addInterestingAidlPids(pids);
        addInterestingHidlPids(pids);

        int[] nativePids = Process.getPidsForCommands(NATIVE_STACKS_OF_INTEREST);
        if (nativePids != null) {
            for (int i : nativePids) {
                pids.add(i);
            }
        }

        return new ArrayList<Integer>(pids);
    }

    /** Body of the watchdog thread: schedules checks, waits, and reacts to hangs. Never returns. */
    private void run() {
        boolean waitedHalf = false;
        // Must live outside the loop: it counts down over subsequent iterations so that a
        // debugger that was attached recently still prevents the kill below. Declaring it
        // inside the loop would reset it on every pass and make the countdown dead code.
        int debuggerWasConnected = 0;
        while (true) {
            List<HandlerChecker> blockedCheckers = Collections.emptyList();
            String subject = "";
            boolean allowRestart = true;
            boolean doWaitedHalfDump = false;
            final ArrayList<Integer> pids;
            synchronized (mLock) {
                long timeout = CHECK_INTERVAL;
                // Make sure we (re)spin the checkers that have become idle within
                // this wait-and-check interval
                for (int i = 0; i < mHandlerCheckers.size(); i++) {
                    HandlerChecker hc = mHandlerCheckers.get(i);
                    hc.scheduleCheckLocked();
                }

                if (debuggerWasConnected > 0) {
                    debuggerWasConnected--;
                }

                // NOTE: We use uptimeMillis() here because we do not want to increment the time we
                // wait while asleep. If the device is asleep then the thing that we are waiting
                // to timeout on is asleep as well and won't have a chance to run, causing a false
                // positive on when to kill things.
                long start = SystemClock.uptimeMillis();
                while (timeout > 0) {
                    if (Debug.isDebuggerConnected()) {
                        debuggerWasConnected = 2;
                    }
                    try {
                        mLock.wait(timeout);
                        // Note: mHandlerCheckers and mMonitorChecker may have changed after waiting
                    } catch (InterruptedException e) {
                        Log.wtf(TAG, e);
                    }
                    if (Debug.isDebuggerConnected()) {
                        debuggerWasConnected = 2;
                    }
                    timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
                }

                final int waitState = evaluateCheckerCompletionLocked();
                if (waitState == COMPLETED) {
                    // The monitors have returned; reset
                    waitedHalf = false;
                    continue;
                } else if (waitState == WAITING) {
                    // still waiting but within their configured intervals; back off and recheck
                    continue;
                } else if (waitState == WAITED_HALF) {
                    if (!waitedHalf) {
                        Slog.i(TAG, "WAITED_HALF");
                        waitedHalf = true;
                        // We've waited half, but we'd need to do the stack trace dump w/o the lock.
                        pids = new ArrayList<>(mInterestingJavaPids);
                        doWaitedHalfDump = true;
                    } else {
                        continue;
                    }
                } else {
                    // something is overdue!
                    blockedCheckers = getBlockedCheckersLocked();
                    subject = describeCheckersLocked(blockedCheckers);
                    allowRestart = mAllowRestart;
                    pids = new ArrayList<>(mInterestingJavaPids);
                }
            } // END synchronized (mLock)

            if (doWaitedHalfDump) {
                // We've waited half the deadlock-detection interval.  Pull a stack
                // trace and wait another half.
                ActivityManagerService.dumpStackTraces(pids, null, null,
                        getInterestingNativePids(), null, subject);
                continue;
            }

            // If we got here, that means that the system is most likely hung.
            // First collect stack traces from all threads of the system process.
            // Then kill this process so that the system will restart.
            EventLog.writeEvent(EventLogTags.WATCHDOG, subject);

            final UUID errorId;
            if (mTraceErrorLogger.isAddErrorIdEnabled()) {
                errorId = mTraceErrorLogger.generateErrorId();
                mTraceErrorLogger.addErrorIdToTrace("system_server", errorId);
            } else {
                errorId = null;
            }

            // Log the atom as early as possible since it is used as a mechanism to trigger
            // Perfetto. Ideally, the Perfetto trace capture should happen as close to the
            // point in time when the Watchdog happens as possible.
            FrameworkStatsLog.write(FrameworkStatsLog.SYSTEM_SERVER_WATCHDOG_OCCURRED, subject);

            long anrTime = SystemClock.uptimeMillis();
            StringBuilder report = new StringBuilder();
            report.append(MemoryPressureUtil.currentPsiState());
            ProcessCpuTracker processCpuTracker = new ProcessCpuTracker(false);
            StringWriter tracesFileException = new StringWriter();
            final File stack = ActivityManagerService.dumpStackTraces(
                    pids, processCpuTracker, new SparseArray<>(), getInterestingNativePids(),
                    tracesFileException, subject);

            // Give some extra time to make sure the stack traces get written.
            // The system's been hanging for a minute, another second or two won't hurt much.
            SystemClock.sleep(5000);

            processCpuTracker.update();
            report.append(processCpuTracker.printCurrentState(anrTime));
            report.append(tracesFileException.getBuffer());

            // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to
            // the kernel log
            doSysRq('w');
            doSysRq('l');

            // Try to add the error to the dropbox, but assuming that the ActivityManager
            // itself may be deadlocked.  (which has happened, causing this statement to
            // deadlock and the watchdog as a whole to be ineffective)
            Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
                    public void run() {
                        // If a watched thread hangs before init() is called, we don't have a
                        // valid mActivity. So we can't log the error to dropbox.
                        if (mActivity != null) {
                            mActivity.addErrorToDropBox(
                                    "watchdog", null, "system_server", null, null, null,
                                    null, report.toString(), stack, null, null, null,
                                    errorId);
                        }
                    }
                };
            dropboxThread.start();
            try {
                dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
            } catch (InterruptedException ignored) {
                // Best effort only: carry on with the shutdown sequence regardless.
            }

            IActivityController controller;
            synchronized (mLock) {
                controller = mController;
            }
            if (controller != null) {
                Slog.i(TAG, "Reporting stuck state to activity controller");
                try {
                    Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
                    // 1 = keep waiting, -1 = kill system
                    int res = controller.systemNotResponding(subject);
                    if (res >= 0) {
                        Slog.i(TAG, "Activity controller requested to coninue to wait");
                        waitedHalf = false;
                        continue;
                    }
                } catch (RemoteException e) {
                    // The controller could not be reached; fall through to the kill decision.
                    Slog.w(TAG, "Failed to report stuck state to activity controller", e);
                }
            }

            // Only kill the process if the debugger is not attached.
            if (Debug.isDebuggerConnected()) {
                debuggerWasConnected = 2;
            }
            if (debuggerWasConnected >= 2) {
                Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
            } else if (debuggerWasConnected > 0) {
                Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
            } else if (!allowRestart) {
                Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
            } else {
                Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
                WatchdogDiagnostics.diagnoseCheckers(blockedCheckers);
                Slog.w(TAG, "*** GOODBYE!");
                if (!Build.IS_USER && isCrashLoopFound()
                        && !WatchdogProperties.should_ignore_fatal_count().orElse(false)) {
                    breakCrashLoop();
                }
                Process.killProcess(Process.myPid());
                System.exit(10);
            }

            waitedHalf = false;
        }
    }

    /** Writes the single command character {@code c} to {@code /proc/sysrq-trigger}. */
    private void doSysRq(char c) {
        // try-with-resources so the writer is closed even when the write itself fails.
        try (FileWriter sysrqTrigger = new FileWriter("/proc/sysrq-trigger")) {
            sysrqTrigger.write(c);
        } catch (IOException e) {
            Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e);
        }
    }

    /** Replaces the stored timeout history with an empty one. */
    private void resetTimeoutHistory() {
        writeTimeoutHistory(new ArrayList<String>());
    }

    /**
     * Writes {@code "<ro.boottime.zygote>:<comma separated history>"} to the history file.
     */
    private void writeTimeoutHistory(Iterable<String> crashHistory) {
        String data = String.join(",", crashHistory);

        try (FileWriter writer = new FileWriter(TIMEOUT_HISTORY_FILE)) {
            writer.write(SystemProperties.get("ro.boottime.zygote"));
            writer.write(":");
            writer.write(data);
        } catch (IOException e) {
            Slog.e(TAG, "Failed to write file " + TIMEOUT_HISTORY_FILE, e);
        }
    }

    /**
     * Reads the timeout history written by {@link #writeTimeoutHistory}.
     *
     * @return the history entries, or an empty array if the file is missing, unreadable, empty,
     *         or was recorded under a different {@code ro.boottime.zygote} value
     */
    private String[] readTimeoutHistory() {
        final String[] emptyStringArray = {};

        try (BufferedReader reader = new BufferedReader(new FileReader(TIMEOUT_HISTORY_FILE))) {
            String line = reader.readLine();
            if (line == null) {
                return emptyStringArray;
            }

            String[] data = line.trim().split(":");
            String boottime = data.length >= 1 ? data[0] : "";
            String history = data.length >= 2 ? data[1] : "";
            if (SystemProperties.get("ro.boottime.zygote").equals(boottime) && !history.isEmpty()) {
                return history.split(",");
            } else {
                return emptyStringArray;
            }
        } catch (FileNotFoundException e) {
            return emptyStringArray;
        } catch (IOException e) {
            Slog.e(TAG, "Failed to read file " + TIMEOUT_HISTORY_FILE, e);
            return emptyStringArray;
        }
    }

    /** Returns true if the android_usb state file reads {@code CONFIGURED}. */
    private boolean hasActiveUsbConnection() {
        try {
            final String state = FileUtils.readTextFile(
                    new File("/sys/class/android_usb/android0/state"),
                    128 /*max*/, null /*ellipsis*/).trim();
            if ("CONFIGURED".equals(state)) {
                return true;
            }
        } catch (IOException e) {
            Slog.w(TAG, "Failed to determine if device was on USB", e);
        }
        return false;
    }

    /**
     * Records the current watchdog timeout in the history file and decides whether the
     * configured number of timeouts has happened within the configured time window.
     *
     * @return true if at least {@code fatal_count} timeouts (including this one) were recorded
     *         within {@code fatal_window_seconds} and there is no active USB connection
     */
    private boolean isCrashLoopFound() {
        int fatalCount = WatchdogProperties.fatal_count().orElse(0);
        long fatalWindowMs = TimeUnit.SECONDS.toMillis(
                WatchdogProperties.fatal_window_seconds().orElse(0));
        if (fatalCount == 0 || fatalWindowMs == 0) {
            if (fatalCount != fatalWindowMs) {
                Slog.w(TAG, String.format("sysprops '%s' and '%s' should be set or unset together",
                            PROP_FATAL_LOOP_COUNT, PROP_FATAL_LOOP_WINDOWS_SECS));
            }
            return false;
        }

        // new-history = [last (fatalCount - 1) items in old-history] + [nowMs].
        // Keeping exactly (fatalCount - 1) old entries makes crashHistory.get(0) the oldest of
        // the last fatalCount timeouts, which is what the window check below needs. The number
        // of kept entries is clamped at zero so a negative sysprop cannot push the range start
        // past the end of the array.
        long nowMs = SystemClock.elapsedRealtime(); // Time since boot including deep sleep.
        String[] rawCrashHistory = readTimeoutHistory();
        int keptEntries = Math.max(0, fatalCount - 1);
        ArrayList<String> crashHistory = new ArrayList<String>(Arrays.asList(Arrays.copyOfRange(
                        rawCrashHistory,
                        Math.max(0, rawCrashHistory.length - keptEntries),
                        rawCrashHistory.length)));
        crashHistory.add(String.valueOf(nowMs));
        writeTimeoutHistory(crashHistory);

        // Returns false if the device has an active USB connection.
        if (hasActiveUsbConnection()) {
            return false;
        }

        long firstCrashMs;
        try {
            firstCrashMs = Long.parseLong(crashHistory.get(0));
        } catch (NumberFormatException t) {
            Slog.w(TAG, "Failed to parseLong " + crashHistory.get(0), t);
            resetTimeoutHistory();
            return false;
        }
        return crashHistory.size() >= fatalCount && nowMs - firstCrashMs < fatalWindowMs;
    }

    /** Notes the fatal reset in {@code /dev/kmsg_debug} and then issues sysrq command 'c'. */
    private void breakCrashLoop() {
        try (FileWriter kmsg = new FileWriter("/dev/kmsg_debug", /* append= */ true)) {
            kmsg.append("Fatal reset to escape the system_server crashing loop\n");
        } catch (IOException e) {
            Slog.w(TAG, "Failed to append to kmsg", e);
        }
        doSysRq('c');
    }
}