1 /* 2 * Copyright (C) 2021 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.server.pm; 18 19 import android.annotation.Nullable; 20 import android.os.Handler; 21 import android.os.Looper; 22 import android.os.Message; 23 import android.os.SystemClock; 24 import android.text.TextUtils; 25 26 import com.android.internal.annotations.GuardedBy; 27 import com.android.server.EventLogTags; 28 29 import java.io.PrintWriter; 30 import java.util.Arrays; 31 import java.util.Locale; 32 33 /** 34 * This class records statistics about PackageManagerService snapshots. It maintains two sets of 35 * statistics: a periodic set which represents the last 10 minutes, and a cumulative set since 36 * process boot. The key metrics that are recorded are: 37 * <ul> 38 * <li> The time to create a snapshot - this is the performance cost of a snapshot 39 * <li> The lifetime of the snapshot - creation time over lifetime is the amortized cost 40 * <li> The number of times a snapshot is reused - this is the number of times lock 41 * contention was avoided. 42 * </ul> 43 44 * The time conversions in this class are designed to keep arithmetic using ints, rather 45 * than longs. Raw times are supplied as longs in units of us. These are left long. 46 * Rebuild durations however, are converted to ints. An int can express a duration of 47 * approximately 35 minutes. This is longer than any expected snapshot rebuild time, so 48 * an int is satisfactory. The exception is the cumulative rebuild time over the course 49 * of a monitoring cycle: this value is kept long since the cycle time is one week and in 50 * a badly behaved system, the rebuild time might exceed 35 minutes. 51 52 * @hide 53 */ 54 public class SnapshotStatistics { 55 /** 56 * The interval at which statistics should be ticked. It is 60s. The interval is in 57 * units of milliseconds because that is what's required by Handler.sendMessageDelayed(). 58 */ 59 public static final int SNAPSHOT_TICK_INTERVAL_MS = 60 * 1000; 60 61 /** 62 * The number of ticks for long statistics. This is one week. 63 */ 64 public static final int SNAPSHOT_LONG_TICKS = 7 * 24 * 60; 65 66 /** 67 * The number snapshot event logs that can be generated in a single logging interval. 68 * A small number limits the logging generated by this class. A snapshot event log is 69 * generated for every big snapshot build time, up to the limit, or whenever the 70 * maximum build time is exceeded in the logging interval. 71 */ 72 public static final int SNAPSHOT_BUILD_REPORT_LIMIT = 10; 73 74 /** 75 * The number of microseconds in a millisecond. 76 */ 77 private static final int US_IN_MS = 1000; 78 79 /** 80 * A snapshot build time is "big" if it takes longer than 10ms. 81 */ 82 public static final int SNAPSHOT_BIG_BUILD_TIME_US = 10 * US_IN_MS; 83 84 /** 85 * A snapshot build time is reportable if it takes longer than 30ms. Testing shows 86 * that this is very rare. 87 */ 88 public static final int SNAPSHOT_REPORTABLE_BUILD_TIME_US = 30 * US_IN_MS; 89 90 /** 91 * A snapshot is short-lived it used fewer than 5 times. 92 */ 93 public static final int SNAPSHOT_SHORT_LIFETIME = 5; 94 95 /** 96 * The lock to control access to this object. 97 */ 98 private final Object mLock = new Object(); 99 100 /** 101 * The bins for the build time histogram. Values are in us. 102 */ 103 private final BinMap mTimeBins; 104 105 /** 106 * The bins for the snapshot use histogram. 107 */ 108 private final BinMap mUseBins; 109 110 /** 111 * The number of events reported in the current tick. 112 */ 113 private int mEventsReported = 0; 114 115 /** 116 * The tick counter. At the default tick interval, this wraps every 4000 years or so. 117 */ 118 private int mTicks = 0; 119 120 /** 121 * The handler used for the periodic ticks. 122 */ 123 private Handler mHandler = null; 124 125 /** 126 * Convert ns to an int ms. The maximum range of this method is about 24 days. There 127 * is no expectation that an event will take longer than that. 128 */ usToMs(int us)129 private int usToMs(int us) { 130 return us / US_IN_MS; 131 } 132 133 /** 134 * This class exists to provide a fast bin lookup for histograms. An instance has an 135 * integer array that maps incoming values to bins. Values larger than the array are 136 * mapped to the top-most bin. 137 */ 138 private static class BinMap { 139 140 // The number of bins 141 private int mCount; 142 // The mapping of low integers to bins 143 private int[] mBinMap; 144 // The maximum mapped value. Values at or above this are mapped to the 145 // top bin. 146 private int mMaxBin; 147 // A copy of the original key 148 private int[] mUserKey; 149 150 /** 151 * Create a bin map. The input is an array of integers, which must be 152 * monotonically increasing (this is not checked). The result is an integer array 153 * as long as the largest value in the input. 154 */ BinMap(int[] userKey)155 BinMap(int[] userKey) { 156 mUserKey = Arrays.copyOf(userKey, userKey.length); 157 // The number of bins is the length of the keys, plus 1 (for the max). 158 mCount = mUserKey.length + 1; 159 // The maximum value is one more than the last one in the map. 160 mMaxBin = mUserKey[mUserKey.length - 1] + 1; 161 mBinMap = new int[mMaxBin + 1]; 162 163 int j = 0; 164 for (int i = 0; i < mUserKey.length; i++) { 165 while (j <= mUserKey[i]) { 166 mBinMap[j] = i; 167 j++; 168 } 169 } 170 mBinMap[mMaxBin] = mUserKey.length; 171 } 172 173 /** 174 * Map a value to a bin. 175 */ getBin(int x)176 public int getBin(int x) { 177 if (x >= 0 && x < mMaxBin) { 178 return mBinMap[x]; 179 } else if (x >= mMaxBin) { 180 return mBinMap[mMaxBin]; 181 } else { 182 // x is negative. The bin will not be used. 183 return 0; 184 } 185 } 186 187 /** 188 * The number of bins in this map 189 */ count()190 public int count() { 191 return mCount; 192 } 193 194 /** 195 * For convenience, return the user key. 196 */ userKeys()197 public int[] userKeys() { 198 return mUserKey; 199 } 200 } 201 202 /** 203 * A complete set of statistics. These are public, making it simpler for a client to 204 * fetch the individual fields. 205 */ 206 public class Stats { 207 208 /** 209 * The start time for this set of statistics, in us. 210 */ 211 public long mStartTimeUs = 0; 212 213 /** 214 * The completion time for this set of statistics, in ns. A value of zero means 215 * the statistics are still active. 216 */ 217 public long mStopTimeUs = 0; 218 219 /** 220 * The build-time histogram. The total number of rebuilds is the sum over the 221 * histogram entries. 222 */ 223 public int[] mTimes; 224 225 /** 226 * The reuse histogram. The total number of snapshot uses is the sum over the 227 * histogram entries. 228 */ 229 public int[] mUsed; 230 231 /** 232 * The total number of rebuilds. This could be computed by summing over the use 233 * bins, but is maintained separately for convenience. 234 */ 235 public int mTotalBuilds = 0; 236 237 /** 238 * The total number of times any snapshot was used. 239 */ 240 public int mTotalUsed = 0; 241 242 /** 243 * The total number of times a snapshot was bypassed because corking was in effect. 244 */ 245 public int mTotalCorked = 0; 246 247 /** 248 * The total number of builds that count as big, which means they took longer than 249 * SNAPSHOT_BIG_BUILD_TIME_NS. 250 */ 251 public int mBigBuilds = 0; 252 253 /** 254 * The total number of short-lived snapshots 255 */ 256 public int mShortLived = 0; 257 258 /** 259 * The time taken to build snapshots. This is cumulative over the rebuilds 260 * recorded in mRebuilds, so the average time to build a snapshot is given by 261 * mBuildTimeNs/mRebuilds. Note that this cannot be computed from the histogram. 262 */ 263 public long mTotalTimeUs = 0; 264 265 /** 266 * The maximum build time since the last log. 267 */ 268 public int mMaxBuildTimeUs = 0; 269 270 /** 271 * Record the rebuild. The parameters are the length of time it took to build the 272 * latest snapshot, and the number of times the _previous_ snapshot was used. A 273 * negative value for used signals an invalid value, which is the case the first 274 * time a snapshot is every built. 275 */ rebuild(int duration, int used, int buildBin, int useBin, boolean big, boolean quick)276 private void rebuild(int duration, int used, 277 int buildBin, int useBin, boolean big, boolean quick) { 278 mTotalBuilds++; 279 mTimes[buildBin]++; 280 281 if (used >= 0) { 282 mTotalUsed += used; 283 mUsed[useBin]++; 284 } 285 286 mTotalTimeUs += duration; 287 boolean reportIt = false; 288 289 if (big) { 290 mBigBuilds++; 291 } 292 if (quick) { 293 mShortLived++; 294 } 295 if (mMaxBuildTimeUs < duration) { 296 mMaxBuildTimeUs = duration; 297 } 298 } 299 300 /** 301 * Record a cork. 302 */ corked()303 private void corked() { 304 mTotalCorked++; 305 } 306 Stats(long now)307 private Stats(long now) { 308 mStartTimeUs = now; 309 mTimes = new int[mTimeBins.count()]; 310 mUsed = new int[mUseBins.count()]; 311 } 312 313 /** 314 * Create a copy of the argument. The copy is made under lock but can then be 315 * used without holding the lock. 316 */ Stats(Stats orig)317 private Stats(Stats orig) { 318 mStartTimeUs = orig.mStartTimeUs; 319 mStopTimeUs = orig.mStopTimeUs; 320 mTimes = Arrays.copyOf(orig.mTimes, orig.mTimes.length); 321 mUsed = Arrays.copyOf(orig.mUsed, orig.mUsed.length); 322 mTotalBuilds = orig.mTotalBuilds; 323 mTotalUsed = orig.mTotalUsed; 324 mTotalCorked = orig.mTotalCorked; 325 mBigBuilds = orig.mBigBuilds; 326 mShortLived = orig.mShortLived; 327 mTotalTimeUs = orig.mTotalTimeUs; 328 mMaxBuildTimeUs = orig.mMaxBuildTimeUs; 329 } 330 331 /** 332 * Set the end time for the statistics. The end time is used only for reporting 333 * in the dump() method. 334 */ complete(long stop)335 private void complete(long stop) { 336 mStopTimeUs = stop; 337 } 338 339 /** 340 * Format a time span into ddd:HH:MM:SS. The input is in us. 341 */ durationToString(long us)342 private String durationToString(long us) { 343 // s has a range of several years 344 int s = (int) (us / (1000 * 1000)); 345 int m = s / 60; 346 s %= 60; 347 int h = m / 60; 348 m %= 60; 349 int d = h / 24; 350 h %= 24; 351 if (d != 0) { 352 return TextUtils.formatSimple("%2d:%02d:%02d:%02d", d, h, m, s); 353 } else if (h != 0) { 354 return TextUtils.formatSimple("%2s %02d:%02d:%02d", "", h, m, s); 355 } else { 356 return TextUtils.formatSimple("%2s %2s %2d:%02d", "", "", m, s); 357 } 358 } 359 360 /** 361 * Print the prefix for dumping. This does not generate a line to the output. 362 */ dumpPrefix(PrintWriter pw, String indent, long now, boolean header, String title)363 private void dumpPrefix(PrintWriter pw, String indent, long now, boolean header, 364 String title) { 365 pw.print(indent + " "); 366 if (header) { 367 pw.format(Locale.US, "%-23s", title); 368 } else { 369 pw.format(Locale.US, "%11s", durationToString(now - mStartTimeUs)); 370 if (mStopTimeUs != 0) { 371 pw.format(Locale.US, " %11s", durationToString(now - mStopTimeUs)); 372 } else { 373 pw.format(Locale.US, " %11s", "now"); 374 } 375 } 376 } 377 378 /** 379 * Dump the summary statistics record. Choose the header or the data. 380 * number of builds 381 * number of uses 382 * number of corks 383 * number of big builds 384 * number of short lifetimes 385 * cumulative build time, in seconds 386 * maximum build time, in ms 387 */ dumpStats(PrintWriter pw, String indent, long now, boolean header)388 private void dumpStats(PrintWriter pw, String indent, long now, boolean header) { 389 dumpPrefix(pw, indent, now, header, "Summary stats"); 390 if (header) { 391 pw.format(Locale.US, " %10s %10s %10s %10s %10s %10s %10s", 392 "TotBlds", "TotUsed", "TotCork", "BigBlds", "ShortLvd", 393 "TotTime", "MaxTime"); 394 } else { 395 pw.format(Locale.US, 396 " %10d %10d %10d %10d %10d %10d %10d", 397 mTotalBuilds, mTotalUsed, mTotalCorked, mBigBuilds, mShortLived, 398 mTotalTimeUs / 1000, mMaxBuildTimeUs / 1000); 399 } 400 pw.println(); 401 } 402 403 /** 404 * Dump the build time histogram. Choose the header or the data. 405 */ dumpTimes(PrintWriter pw, String indent, long now, boolean header)406 private void dumpTimes(PrintWriter pw, String indent, long now, boolean header) { 407 dumpPrefix(pw, indent, now, header, "Build times"); 408 if (header) { 409 int[] keys = mTimeBins.userKeys(); 410 for (int i = 0; i < keys.length; i++) { 411 pw.format(Locale.US, " %10s", 412 TextUtils.formatSimple("<= %dms", keys[i])); 413 } 414 pw.format(Locale.US, " %10s", 415 TextUtils.formatSimple("> %dms", keys[keys.length - 1])); 416 } else { 417 for (int i = 0; i < mTimes.length; i++) { 418 pw.format(Locale.US, " %10d", mTimes[i]); 419 } 420 } 421 pw.println(); 422 } 423 424 /** 425 * Dump the usage histogram. Choose the header or the data. 426 */ dumpUsage(PrintWriter pw, String indent, long now, boolean header)427 private void dumpUsage(PrintWriter pw, String indent, long now, boolean header) { 428 dumpPrefix(pw, indent, now, header, "Use counters"); 429 if (header) { 430 int[] keys = mUseBins.userKeys(); 431 for (int i = 0; i < keys.length; i++) { 432 pw.format(Locale.US, " %10s", TextUtils.formatSimple("<= %d", keys[i])); 433 } 434 pw.format(Locale.US, " %10s", 435 TextUtils.formatSimple("> %d", keys[keys.length - 1])); 436 } else { 437 for (int i = 0; i < mUsed.length; i++) { 438 pw.format(Locale.US, " %10d", mUsed[i]); 439 } 440 } 441 pw.println(); 442 } 443 444 /** 445 * Dump something, based on the "what" parameter. 446 */ dump(PrintWriter pw, String indent, long now, boolean header, String what)447 private void dump(PrintWriter pw, String indent, long now, boolean header, String what) { 448 if (what.equals("stats")) { 449 dumpStats(pw, indent, now, header); 450 } else if (what.equals("times")) { 451 dumpTimes(pw, indent, now, header); 452 } else if (what.equals("usage")) { 453 dumpUsage(pw, indent, now, header); 454 } else { 455 throw new IllegalArgumentException("unrecognized choice: " + what); 456 } 457 } 458 459 /** 460 * Report the object via an event. Presumably the record indicates an anomalous 461 * incident. 462 */ report()463 private void report() { 464 EventLogTags.writePmSnapshotStats( 465 mTotalBuilds, mTotalUsed, mBigBuilds, mShortLived, 466 mMaxBuildTimeUs / US_IN_MS, mTotalTimeUs / US_IN_MS); 467 } 468 } 469 470 /** 471 * Long statistics. These roll over approximately every week. 472 */ 473 private Stats[] mLong; 474 475 /** 476 * Short statistics. These roll over approximately every minute; 477 */ 478 private Stats[] mShort; 479 480 /** 481 * The time of the last build. This can be used to compute the length of time a 482 * snapshot existed before being replaced. 483 */ 484 private long mLastBuildTime = 0; 485 486 /** 487 * Create a snapshot object. Initialize the bin levels. The last bin catches 488 * everything that is not caught earlier, so its value is not really important. 489 */ SnapshotStatistics()490 public SnapshotStatistics() { 491 // Create the bin thresholds. The time bins are in units of us. 492 mTimeBins = new BinMap(new int[] { 1, 2, 5, 10, 20, 50, 100 }); 493 mUseBins = new BinMap(new int[] { 1, 2, 5, 10, 20, 50, 100 }); 494 495 // Create the raw statistics 496 final long now = SystemClock.currentTimeMicro(); 497 mLong = new Stats[2]; 498 mLong[0] = new Stats(now); 499 mShort = new Stats[10]; 500 mShort[0] = new Stats(now); 501 502 // Create the message handler for ticks and start the ticker. 503 mHandler = new Handler(Looper.getMainLooper()) { 504 @Override 505 public void handleMessage(Message msg) { 506 SnapshotStatistics.this.handleMessage(msg); 507 } 508 }; 509 scheduleTick(); 510 } 511 512 /** 513 * Handle a message. The only messages are ticks, so the message parameter is ignored. 514 */ handleMessage(@ullable Message msg)515 private void handleMessage(@Nullable Message msg) { 516 tick(); 517 scheduleTick(); 518 } 519 520 /** 521 * Schedule one tick, a tick interval in the future. 522 */ scheduleTick()523 private void scheduleTick() { 524 mHandler.sendEmptyMessageDelayed(0, SNAPSHOT_TICK_INTERVAL_MS); 525 } 526 527 /** 528 * Record a rebuild. Cumulative and current statistics are updated. Events may be 529 * generated. 530 * @param now The time at which the snapshot rebuild began, in ns. 531 * @param done The time at which the snapshot rebuild completed, in ns. 532 * @param hits The number of times the previous snapshot was used. 533 */ rebuild(long now, long done, int hits)534 public final void rebuild(long now, long done, int hits) { 535 // The duration has a span of about 2000s 536 final int duration = (int) (done - now); 537 boolean reportEvent = false; 538 synchronized (mLock) { 539 mLastBuildTime = now; 540 541 final int timeBin = mTimeBins.getBin(duration / 1000); 542 final int useBin = mUseBins.getBin(hits); 543 final boolean big = duration >= SNAPSHOT_BIG_BUILD_TIME_US; 544 final boolean quick = hits <= SNAPSHOT_SHORT_LIFETIME; 545 546 mShort[0].rebuild(duration, hits, timeBin, useBin, big, quick); 547 mLong[0].rebuild(duration, hits, timeBin, useBin, big, quick); 548 if (duration >= SNAPSHOT_REPORTABLE_BUILD_TIME_US) { 549 if (mEventsReported++ < SNAPSHOT_BUILD_REPORT_LIMIT) { 550 reportEvent = true; 551 } 552 } 553 } 554 // The IO to the logger is done outside the lock. 555 if (reportEvent) { 556 // Report the first N big builds, and every new maximum after that. 557 EventLogTags.writePmSnapshotRebuild(duration / US_IN_MS, hits); 558 } 559 } 560 561 /** 562 * Record a corked snapshot request. 563 */ corked()564 public final void corked() { 565 synchronized (mLock) { 566 mShort[0].corked(); 567 mLong[0].corked(); 568 } 569 } 570 571 /** 572 * Roll a stats array. Shift the elements up an index and create a new element at 573 * index zero. The old element zero is completed with the specified time. 574 */ 575 @GuardedBy("mLock") shift(Stats[] s, long now)576 private void shift(Stats[] s, long now) { 577 s[0].complete(now); 578 for (int i = s.length - 1; i > 0; i--) { 579 s[i] = s[i - 1]; 580 } 581 s[0] = new Stats(now); 582 } 583 584 /** 585 * Roll the statistics. 586 * <ul> 587 * <li> Roll the quick statistics immediately. 588 * <li> Roll the long statistics every SNAPSHOT_LONG_TICKER ticks. The long 589 * statistics hold a week's worth of data. 590 * <li> Roll the logging statistics every SNAPSHOT_LOGGING_TICKER ticks. The logging 591 * statistics hold 10 minutes worth of data. 592 * </ul> 593 */ tick()594 private void tick() { 595 synchronized (mLock) { 596 long now = SystemClock.currentTimeMicro(); 597 mTicks++; 598 if (mTicks % SNAPSHOT_LONG_TICKS == 0) { 599 shift(mLong, now); 600 } 601 shift(mShort, now); 602 mEventsReported = 0; 603 } 604 } 605 606 /** 607 * Dump the statistics. The header is dumped from l[0], so that must not be null. 608 */ dump(PrintWriter pw, String indent, long now, Stats[] l, Stats[] s, String what)609 private void dump(PrintWriter pw, String indent, long now, Stats[] l, Stats[] s, String what) { 610 l[0].dump(pw, indent, now, true, what); 611 for (int i = 0; i < s.length; i++) { 612 if (s[i] != null) { 613 s[i].dump(pw, indent, now, false, what); 614 } 615 } 616 for (int i = 0; i < l.length; i++) { 617 if (l[i] != null) { 618 l[i].dump(pw, indent, now, false, what); 619 } 620 } 621 } 622 623 /** 624 * Dump the statistics. The format is compatible with the PackageManager dumpsys 625 * output. 626 */ dump(PrintWriter pw, String indent, long now, int unrecorded, int corkLevel, boolean brief)627 public void dump(PrintWriter pw, String indent, long now, int unrecorded, 628 int corkLevel, boolean brief) { 629 // Grab the raw statistics under lock, but print them outside of the lock. 630 Stats[] l; 631 Stats[] s; 632 synchronized (mLock) { 633 l = Arrays.copyOf(mLong, mLong.length); 634 l[0] = new Stats(l[0]); 635 s = Arrays.copyOf(mShort, mShort.length); 636 s[0] = new Stats(s[0]); 637 } 638 pw.format(Locale.US, "%s Unrecorded-hits: %d Cork-level: %d", indent, 639 unrecorded, corkLevel); 640 pw.println(); 641 dump(pw, indent, now, l, s, "stats"); 642 if (brief) { 643 return; 644 } 645 pw.println(); 646 dump(pw, indent, now, l, s, "times"); 647 pw.println(); 648 dump(pw, indent, now, l, s, "usage"); 649 } 650 } 651