1 /*
2  * Copyright (C) 2008 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.server;
18 
19 import android.app.IActivityController;
20 import android.content.BroadcastReceiver;
21 import android.content.Context;
22 import android.content.Intent;
23 import android.content.IntentFilter;
24 import android.hidl.manager.V1_0.IServiceManager;
25 import android.os.Binder;
26 import android.os.Build;
27 import android.os.Debug;
28 import android.os.FileUtils;
29 import android.os.Handler;
30 import android.os.IPowerManager;
31 import android.os.Looper;
32 import android.os.Process;
33 import android.os.RemoteException;
34 import android.os.ServiceDebugInfo;
35 import android.os.ServiceManager;
36 import android.os.SystemClock;
37 import android.os.SystemProperties;
38 import android.sysprop.WatchdogProperties;
39 import android.util.EventLog;
40 import android.util.Log;
41 import android.util.Slog;
42 import android.util.SparseArray;
43 
44 import com.android.internal.os.ProcessCpuTracker;
45 import com.android.internal.os.ZygoteConnectionConstants;
46 import com.android.internal.util.FrameworkStatsLog;
47 import com.android.server.am.ActivityManagerService;
48 import com.android.server.am.TraceErrorLogger;
49 import com.android.server.wm.SurfaceAnimationThread;
50 
51 import java.io.BufferedReader;
52 import java.io.File;
53 import java.io.FileNotFoundException;
54 import java.io.FileReader;
55 import java.io.FileWriter;
56 import java.io.IOException;
57 import java.io.StringWriter;
58 import java.util.ArrayList;
59 import java.util.Arrays;
60 import java.util.Collections;
61 import java.util.HashSet;
62 import java.util.List;
63 import java.util.UUID;
64 import java.util.concurrent.TimeUnit;
65 
/** This class calls its monitors every minute, killing this process if they don't return. */
67 public class Watchdog {
    // Log tag used for the watchdog's Slog/Log output.
    static final String TAG = "Watchdog";

    /** Debug flag. */
    public static final boolean DEBUG = false;

    // Set this to true to use debug default values.
    private static final boolean DB = false;

    // Default checker timeout in milliseconds: 60 seconds (10 seconds when DB is set),
    // multiplied by Build.HW_TIMEOUT_MULTIPLIER.
    // Note 1: Do not lower this value below thirty seconds without tightening the invoke-with
    //         timeout in com.android.internal.os.ZygoteConnection, or wrapped applications
    //         can trigger the watchdog.
    // Note 2: The debug value is already below the wait time in ZygoteConnection. Wrapped
    //         applications may not work with a debug build. CTS will fail.
    private static final long DEFAULT_TIMEOUT =
            (DB ? 10 * 1000 : 60 * 1000) * Build.HW_TIMEOUT_MULTIPLIER;
    // How long the watchdog loop waits on its lock between evaluations of the checkers:
    // half of DEFAULT_TIMEOUT.
    private static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2;

    // Completion states reported by HandlerChecker#getCompletionStateLocked.
    // These are temporally ordered: larger values as lateness increases, which lets the
    // worst state across all checkers be computed with Math.max.
    private static final int COMPLETED = 0;
    private static final int WAITING = 1;
    private static final int WAITED_HALF = 2;
    private static final int OVERDUE = 3;

    // Track watchdog timeout history and break the crash loop if there is one.
    // NOTE(review): these three constants are not referenced in the portion of the file
    // reviewed here; presumably they are consumed further down -- confirm.
    private static final String TIMEOUT_HISTORY_FILE = "/data/system/watchdog-timeout-history.txt";
    private static final String PROP_FATAL_LOOP_COUNT = "framework_watchdog.fatal_count";
    private static final String PROP_FATAL_LOOP_WINDOWS_SECS =
            "framework_watchdog.fatal_window.second";
97     // Which native processes to dump into dropbox's stack traces
    // Which native processes to dump into dropbox's stack traces.
    // The entries are passed as-is to Process.getPidsForCommands() by
    // getInterestingNativePids() to resolve them to pids.
    public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
        "/system/bin/audioserver",
        "/system/bin/cameraserver",
        "/system/bin/drmserver",
        "/system/bin/keystore2",
        "/system/bin/mediadrmserver",
        "/system/bin/mediaserver",
        "/system/bin/netd",
        "/system/bin/sdcard",
        "/system/bin/surfaceflinger",
        "/system/bin/vold",
        "media.extractor", // system/bin/mediaextractor
        "media.metrics", // system/bin/mediametrics
        "media.codec", // vendor/bin/hw/android.hardware.media.omx@1.0-service
        "media.swcodec", // /apex/com.android.media.swcodec/bin/mediaswcodec
        "media.transcoding", // Media transcoding service
        "com.android.bluetooth",  // Bluetooth service
        "/apex/com.android.os.statsd/bin/statsd",  // Stats daemon
    };

    // Fully-qualified HIDL interface names whose hosting processes are of interest.
    // addInterestingHidlPids() does an exact match of these against the interface names
    // reported by the HIDL service manager's debug dump.
    public static final List<String> HAL_INTERFACES_OF_INTEREST = Arrays.asList(
            "android.hardware.audio@4.0::IDevicesFactory",
            "android.hardware.audio@5.0::IDevicesFactory",
            "android.hardware.audio@6.0::IDevicesFactory",
            "android.hardware.audio@7.0::IDevicesFactory",
            "android.hardware.biometrics.face@1.0::IBiometricsFace",
            "android.hardware.biometrics.fingerprint@2.1::IBiometricsFingerprint",
            "android.hardware.bluetooth@1.0::IBluetoothHci",
            "android.hardware.camera.provider@2.4::ICameraProvider",
            "android.hardware.gnss@1.0::IGnss",
            "android.hardware.graphics.allocator@2.0::IAllocator",
            "android.hardware.graphics.composer@2.1::IComposer",
            "android.hardware.health@2.0::IHealth",
            "android.hardware.light@2.0::ILight",
            "android.hardware.media.c2@1.0::IComponentStore",
            "android.hardware.media.omx@1.0::IOmx",
            "android.hardware.media.omx@1.0::IOmxStore",
            "android.hardware.neuralnetworks@1.0::IDevice",
            "android.hardware.power.stats@1.0::IPowerStats",
            "android.hardware.sensors@1.0::ISensors",
            "android.hardware.sensors@2.0::ISensors",
            "android.hardware.sensors@2.1::ISensors",
            "android.hardware.vr@1.0::IVr",
            "android.system.suspend@1.0::ISystemSuspend"
    );

    // Name prefixes of AIDL services whose hosting processes are of interest.
    // addInterestingAidlPids() matches these with String#startsWith against the service
    // names returned by ServiceManager.getServiceDebugInfo(); the trailing '/' is part of
    // the prefix being matched.
    public static final String[] AIDL_INTERFACE_PREFIXES_OF_INTEREST = new String[] {
            "android.hardware.biometrics.face.IFace/",
            "android.hardware.biometrics.fingerprint.IFingerprint/",
            "android.hardware.light.ILights/",
            "android.hardware.power.stats.IPowerStats/",
    };
150 
    // Singleton instance, created on first use by getInstance().
    private static Watchdog sWatchdog;

    // Thread that executes the watchdog loop (run()); started by start().
    private final Thread mThread;

    // Monitor object used both as the lock around the watchdog's mutable state and as the
    // object the watchdog loop waits on between checks.
    private final Object mLock = new Object();

    /* One checker per watched handler thread; populated in the constructor and by addThread(). */
    private final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<>();
    // Checker for the foreground thread; monitors registered via addMonitor() run on it.
    private final HandlerChecker mMonitorChecker;
    // Set by init(); null until then.
    private ActivityManagerService mActivity;

    // Optional controller set via setActivityController(); read under mLock.
    private IActivityController mController;
    // Updated via setAllowRestart(); defaults to true.
    private boolean mAllowRestart = true;
    // Pids of Java processes passed to the stack trace dump: this process plus the
    // processes reported through processStarted()/processDied().
    private final List<Integer> mInterestingJavaPids = new ArrayList<>();

    private final TraceErrorLogger mTraceErrorLogger;
167 
168     /**
169      * Used for checking status of handle threads and scheduling monitor callbacks.
170      */
171     public final class HandlerChecker implements Runnable {
172         private final Handler mHandler;
173         private final String mName;
174         private final long mWaitMax;
175         private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
176         private final ArrayList<Monitor> mMonitorQueue = new ArrayList<Monitor>();
177         private boolean mCompleted;
178         private Monitor mCurrentMonitor;
179         private long mStartTime;
180         private int mPauseCount;
181 
HandlerChecker(Handler handler, String name, long waitMaxMillis)182         HandlerChecker(Handler handler, String name, long waitMaxMillis) {
183             mHandler = handler;
184             mName = name;
185             mWaitMax = waitMaxMillis;
186             mCompleted = true;
187         }
188 
addMonitorLocked(Monitor monitor)189         void addMonitorLocked(Monitor monitor) {
190             // We don't want to update mMonitors when the Handler is in the middle of checking
191             // all monitors. We will update mMonitors on the next schedule if it is safe
192             mMonitorQueue.add(monitor);
193         }
194 
scheduleCheckLocked()195         public void scheduleCheckLocked() {
196             if (mCompleted) {
197                 // Safe to update monitors in queue, Handler is not in the middle of work
198                 mMonitors.addAll(mMonitorQueue);
199                 mMonitorQueue.clear();
200             }
201             if ((mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling())
202                     || (mPauseCount > 0)) {
203                 // Don't schedule until after resume OR
204                 // If the target looper has recently been polling, then
205                 // there is no reason to enqueue our checker on it since that
206                 // is as good as it not being deadlocked.  This avoid having
207                 // to do a context switch to check the thread. Note that we
208                 // only do this if we have no monitors since those would need to
209                 // be executed at this point.
210                 mCompleted = true;
211                 return;
212             }
213             if (!mCompleted) {
214                 // we already have a check in flight, so no need
215                 return;
216             }
217 
218             mCompleted = false;
219             mCurrentMonitor = null;
220             mStartTime = SystemClock.uptimeMillis();
221             mHandler.postAtFrontOfQueue(this);
222         }
223 
isOverdueLocked()224         boolean isOverdueLocked() {
225             return (!mCompleted) && (SystemClock.uptimeMillis() > mStartTime + mWaitMax);
226         }
227 
getCompletionStateLocked()228         public int getCompletionStateLocked() {
229             if (mCompleted) {
230                 return COMPLETED;
231             } else {
232                 long latency = SystemClock.uptimeMillis() - mStartTime;
233                 if (latency < mWaitMax/2) {
234                     return WAITING;
235                 } else if (latency < mWaitMax) {
236                     return WAITED_HALF;
237                 }
238             }
239             return OVERDUE;
240         }
241 
getThread()242         public Thread getThread() {
243             return mHandler.getLooper().getThread();
244         }
245 
getName()246         public String getName() {
247             return mName;
248         }
249 
describeBlockedStateLocked()250         String describeBlockedStateLocked() {
251             if (mCurrentMonitor == null) {
252                 return "Blocked in handler on " + mName + " (" + getThread().getName() + ")";
253             } else {
254                 return "Blocked in monitor " + mCurrentMonitor.getClass().getName()
255                         + " on " + mName + " (" + getThread().getName() + ")";
256             }
257         }
258 
259         @Override
run()260         public void run() {
261             // Once we get here, we ensure that mMonitors does not change even if we call
262             // #addMonitorLocked because we first add the new monitors to mMonitorQueue and
263             // move them to mMonitors on the next schedule when mCompleted is true, at which
264             // point we have completed execution of this method.
265             final int size = mMonitors.size();
266             for (int i = 0 ; i < size ; i++) {
267                 synchronized (mLock) {
268                     mCurrentMonitor = mMonitors.get(i);
269                 }
270                 mCurrentMonitor.monitor();
271             }
272 
273             synchronized (mLock) {
274                 mCompleted = true;
275                 mCurrentMonitor = null;
276             }
277         }
278 
279         /** Pause the HandlerChecker. */
pauseLocked(String reason)280         public void pauseLocked(String reason) {
281             mPauseCount++;
282             // Mark as completed, because there's a chance we called this after the watchog
283             // thread loop called Object#wait after 'WAITED_HALF'. In that case we want to ensure
284             // the next call to #getCompletionStateLocked for this checker returns 'COMPLETED'
285             mCompleted = true;
286             Slog.i(TAG, "Pausing HandlerChecker: " + mName + " for reason: "
287                     + reason + ". Pause count: " + mPauseCount);
288         }
289 
290         /** Resume the HandlerChecker from the last {@link #pauseLocked}. */
resumeLocked(String reason)291         public void resumeLocked(String reason) {
292             if (mPauseCount > 0) {
293                 mPauseCount--;
294                 Slog.i(TAG, "Resuming HandlerChecker: " + mName + " for reason: "
295                         + reason + ". Pause count: " + mPauseCount);
296             } else {
297                 Slog.wtf(TAG, "Already resumed HandlerChecker: " + mName);
298             }
299         }
300     }
301 
302     final class RebootRequestReceiver extends BroadcastReceiver {
303         @Override
onReceive(Context c, Intent intent)304         public void onReceive(Context c, Intent intent) {
305             if (intent.getIntExtra("nowait", 0) != 0) {
306                 rebootSystem("Received ACTION_REBOOT broadcast");
307                 return;
308             }
309             Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent);
310         }
311     }
312 
313     /** Monitor for checking the availability of binder threads. The monitor will block until
314      * there is a binder thread available to process in coming IPCs to make sure other processes
315      * can still communicate with the service.
316      */
317     private static final class BinderThreadMonitor implements Watchdog.Monitor {
318         @Override
monitor()319         public void monitor() {
320             Binder.blockUntilThreadAvailable();
321         }
322     }
323 
324     public interface Monitor {
monitor()325         void monitor();
326     }
327 
getInstance()328     public static Watchdog getInstance() {
329         if (sWatchdog == null) {
330             sWatchdog = new Watchdog();
331         }
332 
333         return sWatchdog;
334     }
335 
Watchdog()336     private Watchdog() {
337         mThread = new Thread(this::run, "watchdog");
338         // Initialize handler checkers for each common thread we want to check.  Note
339         // that we are not currently checking the background thread, since it can
340         // potentially hold longer running operations with no guarantees about the timeliness
341         // of operations there.
342 
343         // The shared foreground thread is the main checker.  It is where we
344         // will also dispatch monitor checks and do other work.
345         mMonitorChecker = new HandlerChecker(FgThread.getHandler(),
346                 "foreground thread", DEFAULT_TIMEOUT);
347         mHandlerCheckers.add(mMonitorChecker);
348         // Add checker for main thread.  We only do a quick check since there
349         // can be UI running on the thread.
350         mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()),
351                 "main thread", DEFAULT_TIMEOUT));
352         // Add checker for shared UI thread.
353         mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(),
354                 "ui thread", DEFAULT_TIMEOUT));
355         // And also check IO thread.
356         mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(),
357                 "i/o thread", DEFAULT_TIMEOUT));
358         // And the display thread.
359         mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(),
360                 "display thread", DEFAULT_TIMEOUT));
361         // And the animation thread.
362         mHandlerCheckers.add(new HandlerChecker(AnimationThread.getHandler(),
363                 "animation thread", DEFAULT_TIMEOUT));
364         // And the surface animation thread.
365         mHandlerCheckers.add(new HandlerChecker(SurfaceAnimationThread.getHandler(),
366                 "surface animation thread", DEFAULT_TIMEOUT));
367 
368         // Initialize monitor for Binder threads.
369         addMonitor(new BinderThreadMonitor());
370 
371         mInterestingJavaPids.add(Process.myPid());
372 
373         // See the notes on DEFAULT_TIMEOUT.
374         assert DB ||
375                 DEFAULT_TIMEOUT > ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS;
376 
377         mTraceErrorLogger = new TraceErrorLogger();
378     }
379 
380     /**
381      * Called by SystemServer to cause the internal thread to begin execution.
382      */
start()383     public void start() {
384         mThread.start();
385     }
386 
387     /**
388      * Registers a {@link BroadcastReceiver} to listen to reboot broadcasts and trigger reboot.
389      * Should be called during boot after the ActivityManagerService is up and registered
390      * as a system service so it can handle registration of a {@link BroadcastReceiver}.
391      */
init(Context context, ActivityManagerService activity)392     public void init(Context context, ActivityManagerService activity) {
393         mActivity = activity;
394         context.registerReceiver(new RebootRequestReceiver(),
395                 new IntentFilter(Intent.ACTION_REBOOT),
396                 android.Manifest.permission.REBOOT, null);
397     }
398 
isInterestingJavaProcess(String processName)399     private static boolean isInterestingJavaProcess(String processName) {
400         return processName.equals(StorageManagerService.sMediaStoreAuthorityProcessName)
401                 || processName.equals("com.android.phone");
402     }
403 
404     /**
405      * Notifies the watchdog when a Java process with {@code pid} is started.
406      * This process may have its stack trace dumped during an ANR.
407      */
processStarted(String processName, int pid)408     public void processStarted(String processName, int pid) {
409         if (isInterestingJavaProcess(processName)) {
410             Slog.i(TAG, "Interesting Java process " + processName + " started. Pid " + pid);
411             synchronized (mLock) {
412                 mInterestingJavaPids.add(pid);
413             }
414         }
415     }
416 
417     /**
418      * Notifies the watchdog when a Java process with {@code pid} dies.
419      */
processDied(String processName, int pid)420     public void processDied(String processName, int pid) {
421         if (isInterestingJavaProcess(processName)) {
422             Slog.i(TAG, "Interesting Java process " + processName + " died. Pid " + pid);
423             synchronized (mLock) {
424                 mInterestingJavaPids.remove(Integer.valueOf(pid));
425             }
426         }
427     }
428 
setActivityController(IActivityController controller)429     public void setActivityController(IActivityController controller) {
430         synchronized (mLock) {
431             mController = controller;
432         }
433     }
434 
setAllowRestart(boolean allowRestart)435     public void setAllowRestart(boolean allowRestart) {
436         synchronized (mLock) {
437             mAllowRestart = allowRestart;
438         }
439     }
440 
addMonitor(Monitor monitor)441     public void addMonitor(Monitor monitor) {
442         synchronized (mLock) {
443             mMonitorChecker.addMonitorLocked(monitor);
444         }
445     }
446 
addThread(Handler thread)447     public void addThread(Handler thread) {
448         addThread(thread, DEFAULT_TIMEOUT);
449     }
450 
addThread(Handler thread, long timeoutMillis)451     public void addThread(Handler thread, long timeoutMillis) {
452         synchronized (mLock) {
453             final String name = thread.getLooper().getThread().getName();
454             mHandlerCheckers.add(new HandlerChecker(thread, name, timeoutMillis));
455         }
456     }
457 
458     /**
459      * Pauses Watchdog action for the currently running thread. Useful before executing long running
460      * operations that could falsely trigger the watchdog. Each call to this will require a matching
461      * call to {@link #resumeWatchingCurrentThread}.
462      *
463      * <p>If the current thread has not been added to the Watchdog, this call is a no-op.
464      *
     * <p>If the Watchdog is already paused for the current thread, this call adds
     * another pause and will require an additional {@link #resumeWatchingCurrentThread} to resume.
467      *
468      * <p>Note: Use with care, as any deadlocks on the current thread will be undetected until all
469      * pauses have been resumed.
470      */
pauseWatchingCurrentThread(String reason)471     public void pauseWatchingCurrentThread(String reason) {
472         synchronized (mLock) {
473             for (HandlerChecker hc : mHandlerCheckers) {
474                 if (Thread.currentThread().equals(hc.getThread())) {
475                     hc.pauseLocked(reason);
476                 }
477             }
478         }
479     }
480 
481     /**
482      * Resumes the last pause from {@link #pauseWatchingCurrentThread} for the currently running
483      * thread.
484      *
485      * <p>If the current thread has not been added to the Watchdog, this call is a no-op.
486      *
487      * <p>If the Watchdog action for the current thread is already resumed, this call logs a wtf.
488      *
489      * <p>If all pauses have been resumed, the Watchdog action is finally resumed, otherwise,
490      * the Watchdog action for the current thread remains paused until resume is called at least
491      * as many times as the calls to pause.
492      */
resumeWatchingCurrentThread(String reason)493     public void resumeWatchingCurrentThread(String reason) {
494         synchronized (mLock) {
495             for (HandlerChecker hc : mHandlerCheckers) {
496                 if (Thread.currentThread().equals(hc.getThread())) {
497                     hc.resumeLocked(reason);
498                 }
499             }
500         }
501     }
502 
503     /**
504      * Perform a full reboot of the system.
505      */
rebootSystem(String reason)506     void rebootSystem(String reason) {
507         Slog.i(TAG, "Rebooting system because: " + reason);
508         IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE);
509         try {
510             pms.reboot(false, reason, false);
511         } catch (RemoteException ex) {
512         }
513     }
514 
evaluateCheckerCompletionLocked()515     private int evaluateCheckerCompletionLocked() {
516         int state = COMPLETED;
517         for (int i=0; i<mHandlerCheckers.size(); i++) {
518             HandlerChecker hc = mHandlerCheckers.get(i);
519             state = Math.max(state, hc.getCompletionStateLocked());
520         }
521         return state;
522     }
523 
getBlockedCheckersLocked()524     private ArrayList<HandlerChecker> getBlockedCheckersLocked() {
525         ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>();
526         for (int i=0; i<mHandlerCheckers.size(); i++) {
527             HandlerChecker hc = mHandlerCheckers.get(i);
528             if (hc.isOverdueLocked()) {
529                 checkers.add(hc);
530             }
531         }
532         return checkers;
533     }
534 
describeCheckersLocked(List<HandlerChecker> checkers)535     private String describeCheckersLocked(List<HandlerChecker> checkers) {
536         StringBuilder builder = new StringBuilder(128);
537         for (int i=0; i<checkers.size(); i++) {
538             if (builder.length() > 0) {
539                 builder.append(", ");
540             }
541             builder.append(checkers.get(i).describeBlockedStateLocked());
542         }
543         return builder.toString();
544     }
545 
addInterestingHidlPids(HashSet<Integer> pids)546     private static void addInterestingHidlPids(HashSet<Integer> pids) {
547         try {
548             IServiceManager serviceManager = IServiceManager.getService();
549             ArrayList<IServiceManager.InstanceDebugInfo> dump =
550                     serviceManager.debugDump();
551             for (IServiceManager.InstanceDebugInfo info : dump) {
552                 if (info.pid == IServiceManager.PidConstant.NO_PID) {
553                     continue;
554                 }
555 
556                 if (!HAL_INTERFACES_OF_INTEREST.contains(info.interfaceName)) {
557                     continue;
558                 }
559 
560                 pids.add(info.pid);
561             }
562         } catch (RemoteException e) {
563             Log.w(TAG, e);
564         }
565     }
566 
addInterestingAidlPids(HashSet<Integer> pids)567     private static void addInterestingAidlPids(HashSet<Integer> pids) {
568         ServiceDebugInfo[] infos = ServiceManager.getServiceDebugInfo();
569         if (infos == null) return;
570 
571         for (ServiceDebugInfo info : infos) {
572             for (String prefix : AIDL_INTERFACE_PREFIXES_OF_INTEREST) {
573                 if (info.name.startsWith(prefix)) {
574                     pids.add(info.debugPid);
575                 }
576             }
577         }
578     }
579 
getInterestingNativePids()580     static ArrayList<Integer> getInterestingNativePids() {
581         HashSet<Integer> pids = new HashSet<>();
582         addInterestingAidlPids(pids);
583         addInterestingHidlPids(pids);
584 
585         int[] nativePids = Process.getPidsForCommands(NATIVE_STACKS_OF_INTEREST);
586         if (nativePids != null) {
587             for (int i : nativePids) {
588                 pids.add(i);
589             }
590         }
591 
592         return new ArrayList<Integer>(pids);
593     }
594 
run()595     private void run() {
596         boolean waitedHalf = false;
597         while (true) {
598             List<HandlerChecker> blockedCheckers = Collections.emptyList();
599             String subject = "";
600             boolean allowRestart = true;
601             int debuggerWasConnected = 0;
602             boolean doWaitedHalfDump = false;
603             final ArrayList<Integer> pids;
604             synchronized (mLock) {
605                 long timeout = CHECK_INTERVAL;
606                 // Make sure we (re)spin the checkers that have become idle within
607                 // this wait-and-check interval
608                 for (int i=0; i<mHandlerCheckers.size(); i++) {
609                     HandlerChecker hc = mHandlerCheckers.get(i);
610                     hc.scheduleCheckLocked();
611                 }
612 
613                 if (debuggerWasConnected > 0) {
614                     debuggerWasConnected--;
615                 }
616 
617                 // NOTE: We use uptimeMillis() here because we do not want to increment the time we
618                 // wait while asleep. If the device is asleep then the thing that we are waiting
619                 // to timeout on is asleep as well and won't have a chance to run, causing a false
620                 // positive on when to kill things.
621                 long start = SystemClock.uptimeMillis();
622                 while (timeout > 0) {
623                     if (Debug.isDebuggerConnected()) {
624                         debuggerWasConnected = 2;
625                     }
626                     try {
627                         mLock.wait(timeout);
628                         // Note: mHandlerCheckers and mMonitorChecker may have changed after waiting
629                     } catch (InterruptedException e) {
630                         Log.wtf(TAG, e);
631                     }
632                     if (Debug.isDebuggerConnected()) {
633                         debuggerWasConnected = 2;
634                     }
635                     timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
636                 }
637 
638                 final int waitState = evaluateCheckerCompletionLocked();
639                 if (waitState == COMPLETED) {
640                     // The monitors have returned; reset
641                     waitedHalf = false;
642                     continue;
643                 } else if (waitState == WAITING) {
644                     // still waiting but within their configured intervals; back off and recheck
645                     continue;
646                 } else if (waitState == WAITED_HALF) {
647                     if (!waitedHalf) {
648                         Slog.i(TAG, "WAITED_HALF");
649                         waitedHalf = true;
650                         // We've waited half, but we'd need to do the stack trace dump w/o the lock.
651                         pids = new ArrayList<>(mInterestingJavaPids);
652                         doWaitedHalfDump = true;
653                     } else {
654                         continue;
655                     }
656                 } else {
657                     // something is overdue!
658                     blockedCheckers = getBlockedCheckersLocked();
659                     subject = describeCheckersLocked(blockedCheckers);
660                     allowRestart = mAllowRestart;
661                     pids = new ArrayList<>(mInterestingJavaPids);
662                 }
663             } // END synchronized (mLock)
664 
665             if (doWaitedHalfDump) {
666                 // We've waited half the deadlock-detection interval.  Pull a stack
667                 // trace and wait another half.
668                 ActivityManagerService.dumpStackTraces(pids, null, null,
669                         getInterestingNativePids(), null, subject);
670                 continue;
671             }
672 
673             // If we got here, that means that the system is most likely hung.
674             // First collect stack traces from all threads of the system process.
675             // Then kill this process so that the system will restart.
676             EventLog.writeEvent(EventLogTags.WATCHDOG, subject);
677 
678             final UUID errorId;
679             if (mTraceErrorLogger.isAddErrorIdEnabled()) {
680                 errorId = mTraceErrorLogger.generateErrorId();
681                 mTraceErrorLogger.addErrorIdToTrace("system_server", errorId);
682             } else {
683                 errorId = null;
684             }
685 
686             // Log the atom as early as possible since it is used as a mechanism to trigger
687             // Perfetto. Ideally, the Perfetto trace capture should happen as close to the
688             // point in time when the Watchdog happens as possible.
689             FrameworkStatsLog.write(FrameworkStatsLog.SYSTEM_SERVER_WATCHDOG_OCCURRED, subject);
690 
691             long anrTime = SystemClock.uptimeMillis();
692             StringBuilder report = new StringBuilder();
693             report.append(MemoryPressureUtil.currentPsiState());
694             ProcessCpuTracker processCpuTracker = new ProcessCpuTracker(false);
695             StringWriter tracesFileException = new StringWriter();
696             final File stack = ActivityManagerService.dumpStackTraces(
697                     pids, processCpuTracker, new SparseArray<>(), getInterestingNativePids(),
698                     tracesFileException, subject);
699 
700             // Give some extra time to make sure the stack traces get written.
701             // The system's been hanging for a minute, another second or two won't hurt much.
702             SystemClock.sleep(5000);
703 
704             processCpuTracker.update();
705             report.append(processCpuTracker.printCurrentState(anrTime));
706             report.append(tracesFileException.getBuffer());
707 
708             // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the kernel log
709             doSysRq('w');
710             doSysRq('l');
711 
712             // Try to add the error to the dropbox, but assuming that the ActivityManager
713             // itself may be deadlocked.  (which has happened, causing this statement to
714             // deadlock and the watchdog as a whole to be ineffective)
715             Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
716                     public void run() {
717                         // If a watched thread hangs before init() is called, we don't have a
718                         // valid mActivity. So we can't log the error to dropbox.
719                         if (mActivity != null) {
720                             mActivity.addErrorToDropBox(
721                                     "watchdog", null, "system_server", null, null, null,
722                                     null, report.toString(), stack, null, null, null,
723                                     errorId);
724                         }
725                     }
726                 };
727             dropboxThread.start();
728             try {
729                 dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
730             } catch (InterruptedException ignored) {}
731 
732             IActivityController controller;
733             synchronized (mLock) {
734                 controller = mController;
735             }
736             if (controller != null) {
737                 Slog.i(TAG, "Reporting stuck state to activity controller");
738                 try {
739                     Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
740                     // 1 = keep waiting, -1 = kill system
741                     int res = controller.systemNotResponding(subject);
742                     if (res >= 0) {
743                         Slog.i(TAG, "Activity controller requested to coninue to wait");
744                         waitedHalf = false;
745                         continue;
746                     }
747                 } catch (RemoteException e) {
748                 }
749             }
750 
751             // Only kill the process if the debugger is not attached.
752             if (Debug.isDebuggerConnected()) {
753                 debuggerWasConnected = 2;
754             }
755             if (debuggerWasConnected >= 2) {
756                 Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
757             } else if (debuggerWasConnected > 0) {
758                 Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
759             } else if (!allowRestart) {
760                 Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
761             } else {
762                 Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
763                 WatchdogDiagnostics.diagnoseCheckers(blockedCheckers);
764                 Slog.w(TAG, "*** GOODBYE!");
765                 if (!Build.IS_USER && isCrashLoopFound()
766                         && !WatchdogProperties.should_ignore_fatal_count().orElse(false)) {
767                     breakCrashLoop();
768                 }
769                 Process.killProcess(Process.myPid());
770                 System.exit(10);
771             }
772 
773             waitedHalf = false;
774         }
775     }
776 
doSysRq(char c)777     private void doSysRq(char c) {
778         try {
779             FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger");
780             sysrq_trigger.write(c);
781             sysrq_trigger.close();
782         } catch (IOException e) {
783             Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e);
784         }
785     }
786 
resetTimeoutHistory()787     private void resetTimeoutHistory() {
788         writeTimeoutHistory(new ArrayList<String>());
789     }
790 
writeTimeoutHistory(Iterable<String> crashHistory)791     private void writeTimeoutHistory(Iterable<String> crashHistory) {
792         String data = String.join(",", crashHistory);
793 
794         try (FileWriter writer = new FileWriter(TIMEOUT_HISTORY_FILE)) {
795             writer.write(SystemProperties.get("ro.boottime.zygote"));
796             writer.write(":");
797             writer.write(data);
798         } catch (IOException e) {
799             Slog.e(TAG, "Failed to write file " + TIMEOUT_HISTORY_FILE, e);
800         }
801     }
802 
readTimeoutHistory()803     private String[] readTimeoutHistory() {
804         final String[] emptyStringArray = {};
805 
806         try (BufferedReader reader = new BufferedReader(new FileReader(TIMEOUT_HISTORY_FILE))) {
807             String line = reader.readLine();
808             if (line == null) {
809                 return emptyStringArray;
810             }
811 
812             String[] data = line.trim().split(":");
813             String boottime = data.length >= 1 ? data[0] : "";
814             String history = data.length >= 2 ? data[1] : "";
815             if (SystemProperties.get("ro.boottime.zygote").equals(boottime) && !history.isEmpty()) {
816                 return history.split(",");
817             } else {
818                 return emptyStringArray;
819             }
820         } catch (FileNotFoundException e) {
821             return emptyStringArray;
822         } catch (IOException e) {
823             Slog.e(TAG, "Failed to read file " + TIMEOUT_HISTORY_FILE, e);
824             return emptyStringArray;
825         }
826     }
827 
hasActiveUsbConnection()828     private boolean hasActiveUsbConnection() {
829         try {
830             final String state = FileUtils.readTextFile(
831                     new File("/sys/class/android_usb/android0/state"),
832                     128 /*max*/, null /*ellipsis*/).trim();
833             if ("CONFIGURED".equals(state)) {
834                 return true;
835             }
836         } catch (IOException e) {
837             Slog.w(TAG, "Failed to determine if device was on USB", e);
838         }
839         return false;
840     }
841 
isCrashLoopFound()842     private boolean isCrashLoopFound() {
843         int fatalCount = WatchdogProperties.fatal_count().orElse(0);
844         long fatalWindowMs = TimeUnit.SECONDS.toMillis(
845                 WatchdogProperties.fatal_window_seconds().orElse(0));
846         if (fatalCount == 0 || fatalWindowMs == 0) {
847             if (fatalCount != fatalWindowMs) {
848                 Slog.w(TAG, String.format("sysprops '%s' and '%s' should be set or unset together",
849                             PROP_FATAL_LOOP_COUNT, PROP_FATAL_LOOP_WINDOWS_SECS));
850             }
851             return false;
852         }
853 
854         // new-history = [last (fatalCount - 1) items in old-history] + [nowMs].
855         long nowMs = SystemClock.elapsedRealtime(); // Time since boot including deep sleep.
856         String[] rawCrashHistory = readTimeoutHistory();
857         ArrayList<String> crashHistory = new ArrayList<String>(Arrays.asList(Arrays.copyOfRange(
858                         rawCrashHistory,
859                         Math.max(0, rawCrashHistory.length - fatalCount - 1),
860                         rawCrashHistory.length)));
861         // Something wrong here.
862         crashHistory.add(String.valueOf(nowMs));
863         writeTimeoutHistory(crashHistory);
864 
865         // Returns false if the device has an active USB connection.
866         if (hasActiveUsbConnection()) {
867             return false;
868         }
869 
870         long firstCrashMs;
871         try {
872             firstCrashMs = Long.parseLong(crashHistory.get(0));
873         } catch (NumberFormatException t) {
874             Slog.w(TAG, "Failed to parseLong " + crashHistory.get(0), t);
875             resetTimeoutHistory();
876             return false;
877         }
878         return crashHistory.size() >= fatalCount && nowMs - firstCrashMs < fatalWindowMs;
879     }
880 
breakCrashLoop()881     private void breakCrashLoop() {
882         try (FileWriter kmsg = new FileWriter("/dev/kmsg_debug", /* append= */ true)) {
883             kmsg.append("Fatal reset to escape the system_server crashing loop\n");
884         } catch (IOException e) {
885             Slog.w(TAG, "Failed to append to kmsg", e);
886         }
887         doSysRq('c');
888     }
889 }
890