1 /*
2  * Copyright (C) 2021 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.server.pm;
18 
19 import android.annotation.Nullable;
20 import android.os.Handler;
21 import android.os.Looper;
22 import android.os.Message;
23 import android.os.SystemClock;
24 import android.text.TextUtils;
25 
26 import com.android.internal.annotations.GuardedBy;
27 import com.android.server.EventLogTags;
28 
29 import java.io.PrintWriter;
30 import java.util.Arrays;
31 import java.util.Locale;
32 
33 /**
34  * This class records statistics about PackageManagerService snapshots.  It maintains two sets of
35  * statistics: a periodic set which represents the last 10 minutes, and a cumulative set since
36  * process boot.  The key metrics that are recorded are:
37  * <ul>
38  * <li> The time to create a snapshot - this is the performance cost of a snapshot
39  * <li> The lifetime of the snapshot - creation time over lifetime is the amortized cost
40  * <li> The number of times a snapshot is reused - this is the number of times lock
41  *      contention was avoided.
42  * </ul>
43 
44  * The time conversions in this class are designed to keep arithmetic using ints, rather
45  * than longs.  Raw times are supplied as longs in units of us.  These are left long.
46  * Rebuild durations however, are converted to ints.  An int can express a duration of
47  * approximately 35 minutes.  This is longer than any expected snapshot rebuild time, so
48  * an int is satisfactory.  The exception is the cumulative rebuild time over the course
49  * of a monitoring cycle: this value is kept long since the cycle time is one week and in
50  * a badly behaved system, the rebuild time might exceed 35 minutes.
51 
52  * @hide
53  */
54 public class SnapshotStatistics {
55     /**
56      * The interval at which statistics should be ticked.  It is 60s.  The interval is in
57      * units of milliseconds because that is what's required by Handler.sendMessageDelayed().
58      */
59     public static final int SNAPSHOT_TICK_INTERVAL_MS = 60 * 1000;
60 
61     /**
62      * The number of ticks for long statistics.  This is one week.
63      */
64     public static final int SNAPSHOT_LONG_TICKS = 7 * 24 * 60;
65 
66     /**
67      * The number snapshot event logs that can be generated in a single logging interval.
68      * A small number limits the logging generated by this class.  A snapshot event log is
69      * generated for every big snapshot build time, up to the limit, or whenever the
70      * maximum build time is exceeded in the logging interval.
71      */
72     public static final int SNAPSHOT_BUILD_REPORT_LIMIT = 10;
73 
74     /**
75      * The number of microseconds in a millisecond.
76      */
77     private static final int US_IN_MS = 1000;
78 
79     /**
80      * A snapshot build time is "big" if it takes longer than 10ms.
81      */
82     public static final int SNAPSHOT_BIG_BUILD_TIME_US = 10 * US_IN_MS;
83 
84     /**
85      * A snapshot build time is reportable if it takes longer than 30ms.  Testing shows
86      * that this is very rare.
87      */
88     public static final int SNAPSHOT_REPORTABLE_BUILD_TIME_US = 30 * US_IN_MS;
89 
90     /**
91      * A snapshot is short-lived it used fewer than 5 times.
92      */
93     public static final int SNAPSHOT_SHORT_LIFETIME = 5;
94 
95     /**
96      * The lock to control access to this object.
97      */
98     private final Object mLock = new Object();
99 
100     /**
101      * The bins for the build time histogram.  Values are in us.
102      */
103     private final BinMap mTimeBins;
104 
105     /**
106      * The bins for the snapshot use histogram.
107      */
108     private final BinMap mUseBins;
109 
110     /**
111      * The number of events reported in the current tick.
112      */
113     private int mEventsReported = 0;
114 
115     /**
116      * The tick counter.  At the default tick interval, this wraps every 4000 years or so.
117      */
118     private int mTicks = 0;
119 
120     /**
121      * The handler used for the periodic ticks.
122      */
123     private Handler mHandler = null;
124 
125     /**
126      * Convert ns to an int ms.  The maximum range of this method is about 24 days.  There
127      * is no expectation that an event will take longer than that.
128      */
usToMs(int us)129     private int usToMs(int us) {
130         return us / US_IN_MS;
131     }
132 
133     /**
134      * This class exists to provide a fast bin lookup for histograms.  An instance has an
135      * integer array that maps incoming values to bins.  Values larger than the array are
136      * mapped to the top-most bin.
137      */
138     private static class BinMap {
139 
140         // The number of bins
141         private int mCount;
142         // The mapping of low integers to bins
143         private int[] mBinMap;
144         // The maximum mapped value.  Values at or above this are mapped to the
145         // top bin.
146         private int mMaxBin;
147         // A copy of the original key
148         private int[] mUserKey;
149 
150         /**
151          * Create a bin map.  The input is an array of integers, which must be
152          * monotonically increasing (this is not checked).  The result is an integer array
153          * as long as the largest value in the input.
154          */
BinMap(int[] userKey)155         BinMap(int[] userKey) {
156             mUserKey = Arrays.copyOf(userKey, userKey.length);
157             // The number of bins is the length of the keys, plus 1 (for the max).
158             mCount = mUserKey.length + 1;
159             // The maximum value is one more than the last one in the map.
160             mMaxBin = mUserKey[mUserKey.length - 1] + 1;
161             mBinMap = new int[mMaxBin + 1];
162 
163             int j = 0;
164             for (int i = 0; i < mUserKey.length; i++) {
165                 while (j <= mUserKey[i]) {
166                     mBinMap[j] = i;
167                     j++;
168                 }
169             }
170             mBinMap[mMaxBin] = mUserKey.length;
171         }
172 
173         /**
174          * Map a value to a bin.
175          */
getBin(int x)176         public int getBin(int x) {
177             if (x >= 0 && x < mMaxBin) {
178                 return mBinMap[x];
179             } else if (x >= mMaxBin) {
180                 return mBinMap[mMaxBin];
181             } else {
182                 // x is negative.  The bin will not be used.
183                 return 0;
184             }
185         }
186 
187         /**
188          * The number of bins in this map
189          */
count()190         public int count() {
191             return mCount;
192         }
193 
194         /**
195          * For convenience, return the user key.
196          */
userKeys()197         public int[] userKeys() {
198             return mUserKey;
199         }
200     }
201 
202     /**
203      * A complete set of statistics.  These are public, making it simpler for a client to
204      * fetch the individual fields.
205      */
206     public class Stats {
207 
208         /**
209          * The start time for this set of statistics, in us.
210          */
211         public long mStartTimeUs = 0;
212 
213         /**
214          * The completion time for this set of statistics, in ns.  A value of zero means
215          * the statistics are still active.
216          */
217         public long mStopTimeUs = 0;
218 
219         /**
220          * The build-time histogram.  The total number of rebuilds is the sum over the
221          * histogram entries.
222          */
223         public int[] mTimes;
224 
225         /**
226          * The reuse histogram.  The total number of snapshot uses is the sum over the
227          * histogram entries.
228          */
229         public int[] mUsed;
230 
231         /**
232          * The total number of rebuilds.  This could be computed by summing over the use
233          * bins, but is maintained separately for convenience.
234          */
235         public int mTotalBuilds = 0;
236 
237         /**
238          * The total number of times any snapshot was used.
239          */
240         public int mTotalUsed = 0;
241 
242         /**
243          * The total number of times a snapshot was bypassed because corking was in effect.
244          */
245         public int mTotalCorked = 0;
246 
247         /**
248          * The total number of builds that count as big, which means they took longer than
249          * SNAPSHOT_BIG_BUILD_TIME_NS.
250          */
251         public int mBigBuilds = 0;
252 
253         /**
254          * The total number of short-lived snapshots
255          */
256         public int mShortLived = 0;
257 
258         /**
259          * The time taken to build snapshots.  This is cumulative over the rebuilds
260          * recorded in mRebuilds, so the average time to build a snapshot is given by
261          * mBuildTimeNs/mRebuilds.  Note that this cannot be computed from the histogram.
262          */
263         public long mTotalTimeUs = 0;
264 
265         /**
266          * The maximum build time since the last log.
267          */
268         public int mMaxBuildTimeUs = 0;
269 
270         /**
271          * Record the rebuild.  The parameters are the length of time it took to build the
272          * latest snapshot, and the number of times the _previous_ snapshot was used.  A
273          * negative value for used signals an invalid value, which is the case the first
274          * time a snapshot is every built.
275          */
rebuild(int duration, int used, int buildBin, int useBin, boolean big, boolean quick)276         private void rebuild(int duration, int used,
277                 int buildBin, int useBin, boolean big, boolean quick) {
278             mTotalBuilds++;
279             mTimes[buildBin]++;
280 
281             if (used >= 0) {
282                 mTotalUsed += used;
283                 mUsed[useBin]++;
284             }
285 
286             mTotalTimeUs += duration;
287             boolean reportIt = false;
288 
289             if (big) {
290                 mBigBuilds++;
291             }
292             if (quick) {
293                 mShortLived++;
294             }
295             if (mMaxBuildTimeUs < duration) {
296                 mMaxBuildTimeUs = duration;
297             }
298         }
299 
300         /**
301          * Record a cork.
302          */
corked()303         private void corked() {
304             mTotalCorked++;
305         }
306 
Stats(long now)307         private Stats(long now) {
308             mStartTimeUs = now;
309             mTimes = new int[mTimeBins.count()];
310             mUsed = new int[mUseBins.count()];
311         }
312 
313         /**
314          * Create a copy of the argument.  The copy is made under lock but can then be
315          * used without holding the lock.
316          */
Stats(Stats orig)317         private Stats(Stats orig) {
318             mStartTimeUs = orig.mStartTimeUs;
319             mStopTimeUs = orig.mStopTimeUs;
320             mTimes = Arrays.copyOf(orig.mTimes, orig.mTimes.length);
321             mUsed = Arrays.copyOf(orig.mUsed, orig.mUsed.length);
322             mTotalBuilds = orig.mTotalBuilds;
323             mTotalUsed = orig.mTotalUsed;
324             mTotalCorked = orig.mTotalCorked;
325             mBigBuilds = orig.mBigBuilds;
326             mShortLived = orig.mShortLived;
327             mTotalTimeUs = orig.mTotalTimeUs;
328             mMaxBuildTimeUs = orig.mMaxBuildTimeUs;
329         }
330 
331         /**
332          * Set the end time for the statistics.  The end time is used only for reporting
333          * in the dump() method.
334          */
complete(long stop)335         private void complete(long stop) {
336             mStopTimeUs = stop;
337         }
338 
339         /**
340          * Format a time span into ddd:HH:MM:SS.  The input is in us.
341          */
durationToString(long us)342         private String durationToString(long us) {
343             // s has a range of several years
344             int s = (int) (us / (1000 * 1000));
345             int m = s / 60;
346             s %= 60;
347             int h = m / 60;
348             m %= 60;
349             int d = h / 24;
350             h %= 24;
351             if (d != 0) {
352                 return TextUtils.formatSimple("%2d:%02d:%02d:%02d", d, h, m, s);
353             } else if (h != 0) {
354                 return TextUtils.formatSimple("%2s %02d:%02d:%02d", "", h, m, s);
355             } else {
356                 return TextUtils.formatSimple("%2s %2s %2d:%02d", "", "", m, s);
357             }
358         }
359 
360         /**
361          * Print the prefix for dumping.  This does not generate a line to the output.
362          */
dumpPrefix(PrintWriter pw, String indent, long now, boolean header, String title)363         private void dumpPrefix(PrintWriter pw, String indent, long now, boolean header,
364                                 String title) {
365             pw.print(indent + " ");
366             if (header) {
367                 pw.format(Locale.US, "%-23s", title);
368             } else {
369                 pw.format(Locale.US, "%11s", durationToString(now - mStartTimeUs));
370                 if (mStopTimeUs != 0) {
371                     pw.format(Locale.US, " %11s", durationToString(now - mStopTimeUs));
372                 } else {
373                     pw.format(Locale.US, " %11s", "now");
374                 }
375             }
376         }
377 
378         /**
379          * Dump the summary statistics record.  Choose the header or the data.
380          *    number of builds
381          *    number of uses
382          *    number of corks
383          *    number of big builds
384          *    number of short lifetimes
385          *    cumulative build time, in seconds
386          *    maximum build time, in ms
387          */
dumpStats(PrintWriter pw, String indent, long now, boolean header)388         private void dumpStats(PrintWriter pw, String indent, long now, boolean header) {
389             dumpPrefix(pw, indent, now, header, "Summary stats");
390             if (header) {
391                 pw.format(Locale.US, "  %10s  %10s  %10s  %10s  %10s  %10s  %10s",
392                           "TotBlds", "TotUsed", "TotCork", "BigBlds", "ShortLvd",
393                           "TotTime", "MaxTime");
394             } else {
395                 pw.format(Locale.US,
396                         "  %10d  %10d  %10d  %10d  %10d  %10d  %10d",
397                         mTotalBuilds, mTotalUsed, mTotalCorked, mBigBuilds, mShortLived,
398                         mTotalTimeUs / 1000, mMaxBuildTimeUs / 1000);
399             }
400             pw.println();
401         }
402 
403         /**
404          * Dump the build time histogram.  Choose the header or the data.
405          */
dumpTimes(PrintWriter pw, String indent, long now, boolean header)406         private void dumpTimes(PrintWriter pw, String indent, long now, boolean header) {
407             dumpPrefix(pw, indent, now, header, "Build times");
408             if (header) {
409                 int[] keys = mTimeBins.userKeys();
410                 for (int i = 0; i < keys.length; i++) {
411                     pw.format(Locale.US, "  %10s",
412                             TextUtils.formatSimple("<= %dms", keys[i]));
413                 }
414                 pw.format(Locale.US, "  %10s",
415                         TextUtils.formatSimple("> %dms", keys[keys.length - 1]));
416             } else {
417                 for (int i = 0; i < mTimes.length; i++) {
418                     pw.format(Locale.US, "  %10d", mTimes[i]);
419                 }
420             }
421             pw.println();
422         }
423 
424         /**
425          * Dump the usage histogram.  Choose the header or the data.
426          */
dumpUsage(PrintWriter pw, String indent, long now, boolean header)427         private void dumpUsage(PrintWriter pw, String indent, long now, boolean header) {
428             dumpPrefix(pw, indent, now, header, "Use counters");
429             if (header) {
430                 int[] keys = mUseBins.userKeys();
431                 for (int i = 0; i < keys.length; i++) {
432                     pw.format(Locale.US, "  %10s", TextUtils.formatSimple("<= %d", keys[i]));
433                 }
434                 pw.format(Locale.US, "  %10s",
435                         TextUtils.formatSimple("> %d", keys[keys.length - 1]));
436             } else {
437                 for (int i = 0; i < mUsed.length; i++) {
438                     pw.format(Locale.US, "  %10d", mUsed[i]);
439                 }
440             }
441             pw.println();
442         }
443 
444         /**
445          * Dump something, based on the "what" parameter.
446          */
dump(PrintWriter pw, String indent, long now, boolean header, String what)447         private void dump(PrintWriter pw, String indent, long now, boolean header, String what) {
448             if (what.equals("stats")) {
449                 dumpStats(pw, indent, now, header);
450             } else if (what.equals("times")) {
451                 dumpTimes(pw, indent, now, header);
452             } else if (what.equals("usage")) {
453                 dumpUsage(pw, indent, now, header);
454             } else {
455                 throw new IllegalArgumentException("unrecognized choice: " + what);
456             }
457         }
458 
459         /**
460          * Report the object via an event.  Presumably the record indicates an anomalous
461          * incident.
462          */
report()463         private void report() {
464             EventLogTags.writePmSnapshotStats(
465                     mTotalBuilds, mTotalUsed, mBigBuilds, mShortLived,
466                     mMaxBuildTimeUs / US_IN_MS, mTotalTimeUs / US_IN_MS);
467         }
468     }
469 
470     /**
471      * Long statistics.  These roll over approximately every week.
472      */
473     private Stats[] mLong;
474 
475     /**
476      * Short statistics.  These roll over approximately every minute;
477      */
478     private Stats[] mShort;
479 
480     /**
481      * The time of the last build.  This can be used to compute the length of time a
482      * snapshot existed before being replaced.
483      */
484     private long mLastBuildTime = 0;
485 
486     /**
487      * Create a snapshot object.  Initialize the bin levels.  The last bin catches
488      * everything that is not caught earlier, so its value is not really important.
489      */
SnapshotStatistics()490     public SnapshotStatistics() {
491         // Create the bin thresholds.  The time bins are in units of us.
492         mTimeBins = new BinMap(new int[] { 1, 2, 5, 10, 20, 50, 100 });
493         mUseBins = new BinMap(new int[] { 1, 2, 5, 10, 20, 50, 100 });
494 
495         // Create the raw statistics
496         final long now = SystemClock.currentTimeMicro();
497         mLong = new Stats[2];
498         mLong[0] = new Stats(now);
499         mShort = new Stats[10];
500         mShort[0] = new Stats(now);
501 
502         // Create the message handler for ticks and start the ticker.
503         mHandler = new Handler(Looper.getMainLooper()) {
504                 @Override
505                 public void handleMessage(Message msg) {
506                     SnapshotStatistics.this.handleMessage(msg);
507                 }
508             };
509         scheduleTick();
510     }
511 
512     /**
513      * Handle a message.  The only messages are ticks, so the message parameter is ignored.
514      */
handleMessage(@ullable Message msg)515     private void handleMessage(@Nullable Message msg) {
516         tick();
517         scheduleTick();
518     }
519 
520     /**
521      * Schedule one tick, a tick interval in the future.
522      */
scheduleTick()523     private void scheduleTick() {
524         mHandler.sendEmptyMessageDelayed(0, SNAPSHOT_TICK_INTERVAL_MS);
525     }
526 
527     /**
528      * Record a rebuild.  Cumulative and current statistics are updated.  Events may be
529      * generated.
530      * @param now The time at which the snapshot rebuild began, in ns.
531      * @param done The time at which the snapshot rebuild completed, in ns.
532      * @param hits The number of times the previous snapshot was used.
533      */
rebuild(long now, long done, int hits)534     public final void rebuild(long now, long done, int hits) {
535         // The duration has a span of about 2000s
536         final int duration = (int) (done - now);
537         boolean reportEvent = false;
538         synchronized (mLock) {
539             mLastBuildTime = now;
540 
541             final int timeBin = mTimeBins.getBin(duration / 1000);
542             final int useBin = mUseBins.getBin(hits);
543             final boolean big = duration >= SNAPSHOT_BIG_BUILD_TIME_US;
544             final boolean quick = hits <= SNAPSHOT_SHORT_LIFETIME;
545 
546             mShort[0].rebuild(duration, hits, timeBin, useBin, big, quick);
547             mLong[0].rebuild(duration, hits, timeBin, useBin, big, quick);
548             if (duration >= SNAPSHOT_REPORTABLE_BUILD_TIME_US) {
549                 if (mEventsReported++ < SNAPSHOT_BUILD_REPORT_LIMIT) {
550                     reportEvent = true;
551                 }
552             }
553         }
554         // The IO to the logger is done outside the lock.
555         if (reportEvent) {
556             // Report the first N big builds, and every new maximum after that.
557             EventLogTags.writePmSnapshotRebuild(duration / US_IN_MS, hits);
558         }
559     }
560 
561     /**
562      * Record a corked snapshot request.
563      */
corked()564     public final void corked() {
565         synchronized (mLock) {
566             mShort[0].corked();
567             mLong[0].corked();
568         }
569     }
570 
571     /**
572      * Roll a stats array.  Shift the elements up an index and create a new element at
573      * index zero.  The old element zero is completed with the specified time.
574      */
575     @GuardedBy("mLock")
shift(Stats[] s, long now)576     private void shift(Stats[] s, long now) {
577         s[0].complete(now);
578         for (int i = s.length - 1; i > 0; i--) {
579             s[i] = s[i - 1];
580         }
581         s[0] = new Stats(now);
582     }
583 
584     /**
585      * Roll the statistics.
586      * <ul>
587      * <li> Roll the quick statistics immediately.
588      * <li> Roll the long statistics every SNAPSHOT_LONG_TICKER ticks.  The long
589      * statistics hold a week's worth of data.
590      * <li> Roll the logging statistics every SNAPSHOT_LOGGING_TICKER ticks.  The logging
591      * statistics hold 10 minutes worth of data.
592      * </ul>
593      */
tick()594     private void tick() {
595         synchronized (mLock) {
596             long now = SystemClock.currentTimeMicro();
597             mTicks++;
598             if (mTicks % SNAPSHOT_LONG_TICKS == 0) {
599                 shift(mLong, now);
600             }
601             shift(mShort, now);
602             mEventsReported = 0;
603         }
604     }
605 
606     /**
607      * Dump the statistics.  The header is dumped from l[0], so that must not be null.
608      */
dump(PrintWriter pw, String indent, long now, Stats[] l, Stats[] s, String what)609     private void dump(PrintWriter pw, String indent, long now, Stats[] l, Stats[] s, String what) {
610         l[0].dump(pw, indent, now, true, what);
611         for (int i = 0; i < s.length; i++) {
612             if (s[i] != null) {
613                 s[i].dump(pw, indent, now, false, what);
614             }
615         }
616         for (int i = 0; i < l.length; i++) {
617             if (l[i] != null) {
618                 l[i].dump(pw, indent, now, false, what);
619             }
620         }
621     }
622 
623     /**
624      * Dump the statistics.  The format is compatible with the PackageManager dumpsys
625      * output.
626      */
dump(PrintWriter pw, String indent, long now, int unrecorded, int corkLevel, boolean brief)627     public void dump(PrintWriter pw, String indent, long now, int unrecorded,
628                      int corkLevel, boolean brief) {
629         // Grab the raw statistics under lock, but print them outside of the lock.
630         Stats[] l;
631         Stats[] s;
632         synchronized (mLock) {
633             l = Arrays.copyOf(mLong, mLong.length);
634             l[0] = new Stats(l[0]);
635             s = Arrays.copyOf(mShort, mShort.length);
636             s[0] = new Stats(s[0]);
637         }
638         pw.format(Locale.US, "%s Unrecorded-hits: %d  Cork-level: %d", indent,
639                   unrecorded, corkLevel);
640         pw.println();
641         dump(pw, indent, now, l, s, "stats");
642         if (brief) {
643             return;
644         }
645         pw.println();
646         dump(pw, indent, now, l, s, "times");
647         pw.println();
648         dump(pw, indent, now, l, s, "usage");
649     }
650 }
651