1 /*
2  * Copyright (C) 2013 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #define LOG_TAG "lowmemorykiller"
18 
19 #include <dirent.h>
20 #include <errno.h>
21 #include <inttypes.h>
22 #include <pwd.h>
23 #include <sched.h>
24 #include <signal.h>
25 #include <stdbool.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <sys/cdefs.h>
29 #include <sys/epoll.h>
30 #include <sys/eventfd.h>
31 #include <sys/mman.h>
32 #include <sys/pidfd.h>
33 #include <sys/resource.h>
34 #include <sys/socket.h>
35 #include <sys/syscall.h>
36 #include <sys/sysinfo.h>
37 #include <sys/time.h>
38 #include <sys/types.h>
39 #include <time.h>
40 #include <unistd.h>
41 
42 #include <cutils/properties.h>
43 #include <cutils/sockets.h>
44 #include <liblmkd_utils.h>
45 #include <lmkd.h>
46 #include <log/log.h>
47 #include <log/log_event_list.h>
48 #include <log/log_time.h>
49 #include <private/android_filesystem_config.h>
50 #include <processgroup/processgroup.h>
51 #include <psi/psi.h>
52 #include <system/thread_defs.h>
53 
54 #include "statslog.h"
55 
56 #define BPF_FD_JUST_USE_INT
57 #include "BpfSyscallWrappers.h"
58 
/*
 * Define LMKD_TRACE_KILLS to record lmkd kills in kernel traces
 * to profile and correlate with OOM kills
 */
#ifdef LMKD_TRACE_KILLS

#define ATRACE_TAG ATRACE_TAG_ALWAYS
#include <cutils/trace.h>

/*
 * Both variants of these macros expand to a single expression without a
 * trailing semicolon; the caller supplies it. This keeps the tracing and
 * non-tracing builds interchangeable, including inside unbraced if/else.
 */
#define TRACE_KILL_START(pid) ATRACE_INT(__FUNCTION__, pid)
#define TRACE_KILL_END()      ATRACE_INT(__FUNCTION__, 0)

#else /* LMKD_TRACE_KILLS */

/* Tracing disabled: evaluate the argument so it does not look unused */
#define TRACE_KILL_START(pid) ((void)(pid))
#define TRACE_KILL_END() ((void)0)

#endif /* LMKD_TRACE_KILLS */
77 
78 #ifndef __unused
79 #define __unused __attribute__((__unused__))
80 #endif
81 
82 #define MEMCG_SYSFS_PATH "/dev/memcg/"
83 #define MEMCG_MEMORY_USAGE "/dev/memcg/memory.usage_in_bytes"
84 #define MEMCG_MEMORYSW_USAGE "/dev/memcg/memory.memsw.usage_in_bytes"
85 #define ZONEINFO_PATH "/proc/zoneinfo"
86 #define MEMINFO_PATH "/proc/meminfo"
87 #define VMSTAT_PATH "/proc/vmstat"
88 #define PROC_STATUS_TGID_FIELD "Tgid:"
89 #define PROC_STATUS_RSS_FIELD "VmRSS:"
90 #define PROC_STATUS_SWAP_FIELD "VmSwap:"
91 #define LINE_MAX 128
92 
93 #define PERCEPTIBLE_APP_ADJ 200
94 
95 /* Android Logger event logtags (see event.logtags) */
96 #define KILLINFO_LOG_TAG 10195355
97 
98 /* gid containing AID_SYSTEM required */
99 #define INKERNEL_MINFREE_PATH "/sys/module/lowmemorykiller/parameters/minfree"
100 #define INKERNEL_ADJ_PATH "/sys/module/lowmemorykiller/parameters/adj"
101 
102 #define ARRAY_SIZE(x)   (sizeof(x) / sizeof(*(x)))
103 #define EIGHT_MEGA (1 << 23)
104 
105 #define TARGET_UPDATE_MIN_INTERVAL_MS 1000
106 #define THRASHING_RESET_INTERVAL_MS 1000
107 
108 #define NS_PER_MS (NS_PER_SEC / MS_PER_SEC)
109 #define US_PER_MS (US_PER_SEC / MS_PER_SEC)
110 
111 /* Defined as ProcessList.SYSTEM_ADJ in ProcessList.java */
112 #define SYSTEM_ADJ (-900)
113 
114 #define STRINGIFY(x) STRINGIFY_INTERNAL(x)
115 #define STRINGIFY_INTERNAL(x) #x
116 
117 /*
118  * Read lmk property with persist.device_config.lmkd_native.<name> overriding ro.lmk.<name>
119  * persist.device_config.lmkd_native.* properties are being set by experiments. If a new property
120  * can be controlled by an experiment then use GET_LMK_PROPERTY instead of property_get_xxx and
121  * add "on property" triggers in lmkd.rc to react to the experiment flag changes.
122  */
123 #define GET_LMK_PROPERTY(type, name, def) \
124     property_get_##type("persist.device_config.lmkd_native." name, \
125         property_get_##type("ro.lmk." name, def))
126 
127 /*
128  * PSI monitor tracking window size.
129  * PSI monitor generates events at most once per window,
130  * therefore we poll memory state for the duration of
131  * PSI_WINDOW_SIZE_MS after the event happens.
132  */
133 #define PSI_WINDOW_SIZE_MS 1000
134 /* Polling period after PSI signal when pressure is high */
135 #define PSI_POLL_PERIOD_SHORT_MS 10
136 /* Polling period after PSI signal when pressure is low */
137 #define PSI_POLL_PERIOD_LONG_MS 100
138 
139 #define min(a, b) (((a) < (b)) ? (a) : (b))
140 #define max(a, b) (((a) > (b)) ? (a) : (b))
141 
142 #define FAIL_REPORT_RLIMIT_MS 1000
143 
144 /*
145  * System property defaults
146  */
147 /* ro.lmk.swap_free_low_percentage property defaults */
148 #define DEF_LOW_SWAP 10
149 /* ro.lmk.thrashing_limit property defaults */
150 #define DEF_THRASHING_LOWRAM 30
151 #define DEF_THRASHING 100
152 /* ro.lmk.thrashing_limit_decay property defaults */
153 #define DEF_THRASHING_DECAY_LOWRAM 50
154 #define DEF_THRASHING_DECAY 10
155 /* ro.lmk.psi_partial_stall_ms property defaults */
156 #define DEF_PARTIAL_STALL_LOWRAM 200
157 #define DEF_PARTIAL_STALL 70
158 /* ro.lmk.psi_complete_stall_ms property defaults */
159 #define DEF_COMPLETE_STALL 700
160 
161 #define LMKD_REINIT_PROP "lmkd.reinit"
162 
163 /* default to old in-kernel interface if no memory pressure events */
164 static bool use_inkernel_interface = true;
165 static bool has_inkernel_module;
166 
167 /* memory pressure levels */
168 enum vmpressure_level {
169     VMPRESS_LEVEL_LOW = 0,
170     VMPRESS_LEVEL_MEDIUM,
171     VMPRESS_LEVEL_CRITICAL,
172     VMPRESS_LEVEL_COUNT
173 };
174 
175 static const char *level_name[] = {
176     "low",
177     "medium",
178     "critical"
179 };
180 
181 struct {
182     int64_t min_nr_free_pages; /* recorded but not used yet */
183     int64_t max_nr_free_pages;
184 } low_pressure_mem = { -1, -1 };
185 
186 struct psi_threshold {
187     enum psi_stall_type stall_type;
188     int threshold_ms;
189 };
190 
191 static int level_oomadj[VMPRESS_LEVEL_COUNT];
192 static int mpevfd[VMPRESS_LEVEL_COUNT] = { -1, -1, -1 };
193 static bool pidfd_supported;
194 static int last_kill_pid_or_fd = -1;
195 static struct timespec last_kill_tm;
196 
197 /* lmkd configurable parameters */
198 static bool debug_process_killing;
199 static bool enable_pressure_upgrade;
200 static int64_t upgrade_pressure;
201 static int64_t downgrade_pressure;
202 static bool low_ram_device;
203 static bool kill_heaviest_task;
204 static unsigned long kill_timeout_ms;
205 static bool use_minfree_levels;
206 static bool per_app_memcg;
207 static int swap_free_low_percentage;
208 static int psi_partial_stall_ms;
209 static int psi_complete_stall_ms;
210 static int thrashing_limit_pct;
211 static int thrashing_limit_decay_pct;
212 static int thrashing_critical_pct;
213 static int swap_util_max;
214 static int64_t filecache_min_kb;
215 static int64_t stall_limit_critical;
216 static bool use_psi_monitors = false;
217 static int kpoll_fd;
218 static struct psi_threshold psi_thresholds[VMPRESS_LEVEL_COUNT] = {
219     { PSI_SOME, 70 },    /* 70ms out of 1sec for partial stall */
220     { PSI_SOME, 100 },   /* 100ms out of 1sec for partial stall */
221     { PSI_FULL, 70 },    /* 70ms out of 1sec for complete stall */
222 };
223 
224 static android_log_context ctx;
225 
226 enum polling_update {
227     POLLING_DO_NOT_CHANGE,
228     POLLING_START,
229     POLLING_PAUSE,
230     POLLING_RESUME,
231 };
232 
233 /*
234  * Data used for periodic polling for the memory state of the device.
235  * Note that when system is not polling poll_handler is set to NULL,
236  * when polling starts poll_handler gets set and is reset back to
237  * NULL when polling stops.
238  */
239 struct polling_params {
240     struct event_handler_info* poll_handler;
241     struct event_handler_info* paused_handler;
242     struct timespec poll_start_tm;
243     struct timespec last_poll_tm;
244     int polling_interval_ms;
245     enum polling_update update;
246 };
247 
248 /* data required to handle events */
249 struct event_handler_info {
250     int data;
251     void (*handler)(int data, uint32_t events, struct polling_params *poll_params);
252 };
253 
254 /* data required to handle socket events */
255 struct sock_event_handler_info {
256     int sock;
257     pid_t pid;
258     uint32_t async_event_mask;
259     struct event_handler_info handler_info;
260 };
261 
262 /* max supported number of data connections (AMS, init, tests) */
263 #define MAX_DATA_CONN 3
264 
265 /* socket event handler data */
266 static struct sock_event_handler_info ctrl_sock;
267 static struct sock_event_handler_info data_sock[MAX_DATA_CONN];
268 
269 /* vmpressure event handler data */
270 static struct event_handler_info vmpressure_hinfo[VMPRESS_LEVEL_COUNT];
271 
272 /*
273  * 1 ctrl listen socket, 3 ctrl data socket, 3 memory pressure levels,
274  * 1 lmk events + 1 fd to wait for process death
275  */
276 #define MAX_EPOLL_EVENTS (1 + MAX_DATA_CONN + VMPRESS_LEVEL_COUNT + 1 + 1)
277 static int epollfd;
278 static int maxevents;
279 
280 /* OOM score values used by both kernel and framework */
281 #define OOM_SCORE_ADJ_MIN       (-1000)
282 #define OOM_SCORE_ADJ_MAX       1000
283 
284 static int lowmem_adj[MAX_TARGETS];
285 static int lowmem_minfree[MAX_TARGETS];
286 static int lowmem_targets_size;
287 
288 /* Fields to parse in /proc/zoneinfo */
289 /* zoneinfo per-zone fields */
290 enum zoneinfo_zone_field {
291     ZI_ZONE_NR_FREE_PAGES = 0,
292     ZI_ZONE_MIN,
293     ZI_ZONE_LOW,
294     ZI_ZONE_HIGH,
295     ZI_ZONE_PRESENT,
296     ZI_ZONE_NR_FREE_CMA,
297     ZI_ZONE_FIELD_COUNT
298 };
299 
300 static const char* const zoneinfo_zone_field_names[ZI_ZONE_FIELD_COUNT] = {
301     "nr_free_pages",
302     "min",
303     "low",
304     "high",
305     "present",
306     "nr_free_cma",
307 };
308 
309 /* zoneinfo per-zone special fields */
310 enum zoneinfo_zone_spec_field {
311     ZI_ZONE_SPEC_PROTECTION = 0,
312     ZI_ZONE_SPEC_PAGESETS,
313     ZI_ZONE_SPEC_FIELD_COUNT,
314 };
315 
316 static const char* const zoneinfo_zone_spec_field_names[ZI_ZONE_SPEC_FIELD_COUNT] = {
317     "protection:",
318     "pagesets",
319 };
320 
321 /* see __MAX_NR_ZONES definition in kernel mmzone.h */
322 #define MAX_NR_ZONES 6
323 
324 union zoneinfo_zone_fields {
325     struct {
326         int64_t nr_free_pages;
327         int64_t min;
328         int64_t low;
329         int64_t high;
330         int64_t present;
331         int64_t nr_free_cma;
332     } field;
333     int64_t arr[ZI_ZONE_FIELD_COUNT];
334 };
335 
336 struct zoneinfo_zone {
337     union zoneinfo_zone_fields fields;
338     int64_t protection[MAX_NR_ZONES];
339     int64_t max_protection;
340 };
341 
342 /* zoneinfo per-node fields */
343 enum zoneinfo_node_field {
344     ZI_NODE_NR_INACTIVE_FILE = 0,
345     ZI_NODE_NR_ACTIVE_FILE,
346     ZI_NODE_FIELD_COUNT
347 };
348 
349 static const char* const zoneinfo_node_field_names[ZI_NODE_FIELD_COUNT] = {
350     "nr_inactive_file",
351     "nr_active_file",
352 };
353 
354 union zoneinfo_node_fields {
355     struct {
356         int64_t nr_inactive_file;
357         int64_t nr_active_file;
358     } field;
359     int64_t arr[ZI_NODE_FIELD_COUNT];
360 };
361 
362 struct zoneinfo_node {
363     int id;
364     int zone_count;
365     struct zoneinfo_zone zones[MAX_NR_ZONES];
366     union zoneinfo_node_fields fields;
367 };
368 
369 /* for now two memory nodes is more than enough */
370 #define MAX_NR_NODES 2
371 
372 struct zoneinfo {
373     int node_count;
374     struct zoneinfo_node nodes[MAX_NR_NODES];
375     int64_t totalreserve_pages;
376     int64_t total_inactive_file;
377     int64_t total_active_file;
378 };
379 
380 /* Fields to parse in /proc/meminfo */
381 enum meminfo_field {
382     MI_NR_FREE_PAGES = 0,
383     MI_CACHED,
384     MI_SWAP_CACHED,
385     MI_BUFFERS,
386     MI_SHMEM,
387     MI_UNEVICTABLE,
388     MI_TOTAL_SWAP,
389     MI_FREE_SWAP,
390     MI_ACTIVE_ANON,
391     MI_INACTIVE_ANON,
392     MI_ACTIVE_FILE,
393     MI_INACTIVE_FILE,
394     MI_SRECLAIMABLE,
395     MI_SUNRECLAIM,
396     MI_KERNEL_STACK,
397     MI_PAGE_TABLES,
398     MI_ION_HELP,
399     MI_ION_HELP_POOL,
400     MI_CMA_FREE,
401     MI_FIELD_COUNT
402 };
403 
404 static const char* const meminfo_field_names[MI_FIELD_COUNT] = {
405     "MemFree:",
406     "Cached:",
407     "SwapCached:",
408     "Buffers:",
409     "Shmem:",
410     "Unevictable:",
411     "SwapTotal:",
412     "SwapFree:",
413     "Active(anon):",
414     "Inactive(anon):",
415     "Active(file):",
416     "Inactive(file):",
417     "SReclaimable:",
418     "SUnreclaim:",
419     "KernelStack:",
420     "PageTables:",
421     "ION_heap:",
422     "ION_heap_pool:",
423     "CmaFree:",
424 };
425 
/*
 * Values for the /proc/meminfo fields listed in enum meminfo_field.
 * The same storage can be accessed by name through 'field' or by index
 * through 'arr'. 'arr' has MI_FIELD_COUNT elements and therefore overlays
 * only the leading members of 'field' (nr_free_pages .. cma_free), in the
 * same order as enum meminfo_field; the two trailing calculated members
 * are reachable only by name.
 */
union meminfo {
    struct {
        int64_t nr_free_pages;
        int64_t cached;
        int64_t swap_cached;
        int64_t buffers;
        int64_t shmem;
        int64_t unevictable;
        int64_t total_swap;
        int64_t free_swap;
        int64_t active_anon;
        int64_t inactive_anon;
        int64_t active_file;
        int64_t inactive_file;
        int64_t sreclaimable;
        int64_t sunreclaimable;
        int64_t kernel_stack;
        int64_t page_tables;
        int64_t ion_heap;
        int64_t ion_heap_pool;
        int64_t cma_free;
        /* fields below are calculated rather than read from the file */
        int64_t nr_file_pages;
        int64_t total_gpu_kb;
    } field;
    /* Index-based view of the parsed fields, indexed by enum meminfo_field */
    int64_t arr[MI_FIELD_COUNT];
};
453 
/* Fields to parse in /proc/vmstat */
enum vmstat_field {
    VS_FREE_PAGES,
    VS_INACTIVE_FILE,
    VS_ACTIVE_FILE,
    VS_WORKINGSET_REFAULT,
    VS_WORKINGSET_REFAULT_FILE,
    VS_PGSCAN_KSWAPD,
    VS_PGSCAN_DIRECT,
    VS_PGSCAN_DIRECT_THROTTLE,
    VS_FIELD_COUNT
};

/*
 * Names of the /proc/vmstat fields, indexed by enum vmstat_field.
 * The array is sized by VS_FIELD_COUNT (not the meminfo count) so that it
 * has exactly one entry per enumerator and no NULL tail entries that a
 * name lookup could dereference.
 */
static const char* const vmstat_field_names[VS_FIELD_COUNT] = {
    "nr_free_pages",
    "nr_inactive_file",
    "nr_active_file",
    "workingset_refault",
    "workingset_refault_file",
    "pgscan_kswapd",
    "pgscan_direct",
    "pgscan_direct_throttle",
};
477 
478 union vmstat {
479     struct {
480         int64_t nr_free_pages;
481         int64_t nr_inactive_file;
482         int64_t nr_active_file;
483         int64_t workingset_refault;
484         int64_t workingset_refault_file;
485         int64_t pgscan_kswapd;
486         int64_t pgscan_direct;
487         int64_t pgscan_direct_throttle;
488     } field;
489     int64_t arr[VS_FIELD_COUNT];
490 };
491 
492 enum field_match_result {
493     NO_MATCH,
494     PARSE_FAIL,
495     PARSE_SUCCESS
496 };
497 
/*
 * Node of a doubly-linked list. The list helpers in this file treat a head
 * whose prev points back at itself as an empty list.
 */
struct adjslot_list {
    struct adjslot_list *next;
    struct adjslot_list *prev;
};

/* Record kept for each process tracked in the pid hash table */
struct proc {
    struct adjslot_list asl;    /* links the record into procadjslot_list[] */
    int pid;                    /* lookup key in pidhash */
    int pidfd;                  /* closed on removal when >= 0 */
    uid_t uid;
    int oomadj;                 /* selects the procadjslot_list[] slot via ADJTOSLOT */
    pid_t reg_pid; /* PID of the process that registered this record */
    struct proc *pidhash_next;  /* next record in the same pidhash bucket */
};

/* State for a file that is re-read repeatedly through one cached descriptor */
struct reread_data {
    const char* const filename;
    int fd;                     /* -1 until the file has been opened */
};
517 
/*
 * Hash table of tracked processes keyed by pid; collisions are chained
 * through proc.pidhash_next. pid_hashfn masks with (PIDHASH_SZ - 1), so
 * PIDHASH_SZ has to stay a power of two.
 */
#define PIDHASH_SZ 1024
static struct proc *pidhash[PIDHASH_SZ];
#define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1))

/*
 * Map an oom_score_adj value in [OOM_SCORE_ADJ_MIN, OOM_SCORE_ADJ_MAX] to
 * a zero-based slot index; one list head is kept per possible value.
 */
#define ADJTOSLOT(adj) ((adj) + -OOM_SCORE_ADJ_MIN)
#define ADJTOSLOT_COUNT (ADJTOSLOT(OOM_SCORE_ADJ_MAX) + 1)
static struct adjslot_list procadjslot_list[ADJTOSLOT_COUNT];

#define MAX_DISTINCT_OOM_ADJ 32
#define KILLCNT_INVALID_IDX 0xFF
/*
 * Because killcnt array is sparse a two-level indirection is used
 * to keep the size small. killcnt_idx stores index of the element in
 * killcnt array. Index KILLCNT_INVALID_IDX indicates an unused slot.
 */
static uint8_t killcnt_idx[ADJTOSLOT_COUNT];
static uint16_t killcnt[MAX_DISTINCT_OOM_ADJ];
static int killcnt_free_idx = 0;
static uint32_t killcnt_total = 0;
537 
538 /* PAGE_SIZE / 1024 */
539 static long page_k;
540 
541 static void update_props();
542 static bool init_monitors();
543 static void destroy_monitors();
544 
clamp(int low,int high,int value)545 static int clamp(int low, int high, int value) {
546     return max(min(value, high), low);
547 }
548 
/*
 * Parse a base-10 signed integer from the beginning of str.
 *
 * Leading whitespace is skipped and any text following the number is
 * ignored, so inputs such as "1234 kB" are accepted.
 *
 * Returns true and stores the value in *ret on success. Returns false and
 * leaves *ret untouched when no digits are found or when the number does
 * not fit in int64_t.
 */
static bool parse_int64(const char* str, int64_t* ret) {
    char* endptr;
    long long val;

    /* strtoll reports overflow only through errno, so clear it first */
    errno = 0;
    val = strtoll(str, &endptr, 10);
    if (str == endptr) {
        /* no digits consumed */
        return false;
    }
    if (errno == ERANGE || val > INT64_MAX || val < INT64_MIN) {
        /* out of range: do not hand back a silently saturated value */
        return false;
    }
    *ret = (int64_t)val;
    return true;
}
558 
/*
 * Look up name in the first field_count entries of field_names using an
 * exact string comparison.
 * Returns the index of the first match, or -1 when there is none.
 */
static int find_field(const char* name, const char* const field_names[], int field_count) {
    int idx = 0;

    while (idx < field_count) {
        if (strcmp(name, field_names[idx]) == 0) {
            return idx;
        }
        idx++;
    }
    return -1;
}
567 
match_field(const char * cp,const char * ap,const char * const field_names[],int field_count,int64_t * field,int * field_idx)568 static enum field_match_result match_field(const char* cp, const char* ap,
569                                    const char* const field_names[],
570                                    int field_count, int64_t* field,
571                                    int *field_idx) {
572     int i = find_field(cp, field_names, field_count);
573     if (i < 0) {
574         return NO_MATCH;
575     }
576     *field_idx = i;
577     return parse_int64(ap, field) ? PARSE_SUCCESS : PARSE_FAIL;
578 }
579 
580 /*
581  * Read file content from the beginning up to max_len bytes or EOF
582  * whichever happens first.
583  */
read_all(int fd,char * buf,size_t max_len)584 static ssize_t read_all(int fd, char *buf, size_t max_len)
585 {
586     ssize_t ret = 0;
587     off_t offset = 0;
588 
589     while (max_len > 0) {
590         ssize_t r = TEMP_FAILURE_RETRY(pread(fd, buf, max_len, offset));
591         if (r == 0) {
592             break;
593         }
594         if (r == -1) {
595             return -1;
596         }
597         ret += r;
598         buf += r;
599         offset += r;
600         max_len -= r;
601     }
602 
603     return ret;
604 }
605 
606 /*
607  * Read a new or already opened file from the beginning.
608  * If the file has not been opened yet data->fd should be set to -1.
609  * To be used with files which are read often and possibly during high
610  * memory pressure to minimize file opening which by itself requires kernel
611  * memory allocation and might result in a stall on memory stressed system.
612  */
reread_file(struct reread_data * data)613 static char *reread_file(struct reread_data *data) {
614     /* start with page-size buffer and increase if needed */
615     static ssize_t buf_size = PAGE_SIZE;
616     static char *new_buf, *buf = NULL;
617     ssize_t size;
618 
619     if (data->fd == -1) {
620         /* First-time buffer initialization */
621         if (!buf && (buf = static_cast<char*>(malloc(buf_size))) == nullptr) {
622             return NULL;
623         }
624 
625         data->fd = TEMP_FAILURE_RETRY(open(data->filename, O_RDONLY | O_CLOEXEC));
626         if (data->fd < 0) {
627             ALOGE("%s open: %s", data->filename, strerror(errno));
628             return NULL;
629         }
630     }
631 
632     while (true) {
633         size = read_all(data->fd, buf, buf_size - 1);
634         if (size < 0) {
635             ALOGE("%s read: %s", data->filename, strerror(errno));
636             close(data->fd);
637             data->fd = -1;
638             return NULL;
639         }
640         if (size < buf_size - 1) {
641             break;
642         }
643         /*
644          * Since we are reading /proc files we can't use fstat to find out
645          * the real size of the file. Double the buffer size and keep retrying.
646          */
647         if ((new_buf = static_cast<char*>(realloc(buf, buf_size * 2))) == nullptr) {
648             errno = ENOMEM;
649             return NULL;
650         }
651         buf = new_buf;
652         buf_size *= 2;
653     }
654     buf[size] = 0;
655 
656     return buf;
657 }
658 
claim_record(struct proc * procp,pid_t pid)659 static bool claim_record(struct proc* procp, pid_t pid) {
660     if (procp->reg_pid == pid) {
661         /* Record already belongs to the registrant */
662         return true;
663     }
664     if (procp->reg_pid == 0) {
665         /* Old registrant is gone, claim the record */
666         procp->reg_pid = pid;
667         return true;
668     }
669     /* The record is owned by another registrant */
670     return false;
671 }
672 
remove_claims(pid_t pid)673 static void remove_claims(pid_t pid) {
674     int i;
675 
676     for (i = 0; i < PIDHASH_SZ; i++) {
677         struct proc* procp = pidhash[i];
678         while (procp) {
679             if (procp->reg_pid == pid) {
680                 procp->reg_pid = 0;
681             }
682             procp = procp->pidhash_next;
683         }
684     }
685 }
686 
ctrl_data_close(int dsock_idx)687 static void ctrl_data_close(int dsock_idx) {
688     struct epoll_event epev;
689 
690     ALOGI("closing lmkd data connection");
691     if (epoll_ctl(epollfd, EPOLL_CTL_DEL, data_sock[dsock_idx].sock, &epev) == -1) {
692         // Log a warning and keep going
693         ALOGW("epoll_ctl for data connection socket failed; errno=%d", errno);
694     }
695     maxevents--;
696 
697     close(data_sock[dsock_idx].sock);
698     data_sock[dsock_idx].sock = -1;
699 
700     /* Mark all records of the old registrant as unclaimed */
701     remove_claims(data_sock[dsock_idx].pid);
702 }
703 
ctrl_data_read(int dsock_idx,char * buf,size_t bufsz,struct ucred * sender_cred)704 static ssize_t ctrl_data_read(int dsock_idx, char* buf, size_t bufsz, struct ucred* sender_cred) {
705     struct iovec iov = {buf, bufsz};
706     char control[CMSG_SPACE(sizeof(struct ucred))];
707     struct msghdr hdr = {
708             NULL, 0, &iov, 1, control, sizeof(control), 0,
709     };
710     ssize_t ret;
711     ret = TEMP_FAILURE_RETRY(recvmsg(data_sock[dsock_idx].sock, &hdr, 0));
712     if (ret == -1) {
713         ALOGE("control data socket read failed; %s", strerror(errno));
714         return -1;
715     }
716     if (ret == 0) {
717         ALOGE("Got EOF on control data socket");
718         return -1;
719     }
720 
721     struct ucred* cred = NULL;
722     struct cmsghdr* cmsg = CMSG_FIRSTHDR(&hdr);
723     while (cmsg != NULL) {
724         if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_CREDENTIALS) {
725             cred = (struct ucred*)CMSG_DATA(cmsg);
726             break;
727         }
728         cmsg = CMSG_NXTHDR(&hdr, cmsg);
729     }
730 
731     if (cred == NULL) {
732         ALOGE("Failed to retrieve sender credentials");
733         /* Close the connection */
734         ctrl_data_close(dsock_idx);
735         return -1;
736     }
737 
738     memcpy(sender_cred, cred, sizeof(struct ucred));
739 
740     /* Store PID of the peer */
741     data_sock[dsock_idx].pid = cred->pid;
742 
743     return ret;
744 }
745 
ctrl_data_write(int dsock_idx,char * buf,size_t bufsz)746 static int ctrl_data_write(int dsock_idx, char* buf, size_t bufsz) {
747     int ret = 0;
748 
749     ret = TEMP_FAILURE_RETRY(write(data_sock[dsock_idx].sock, buf, bufsz));
750 
751     if (ret == -1) {
752         ALOGE("control data socket write failed; errno=%d", errno);
753     } else if (ret == 0) {
754         ALOGE("Got EOF on control data socket");
755         ret = -1;
756     }
757 
758     return ret;
759 }
760 
761 /*
762  * Write the pid/uid pair over the data socket, note: all active clients
763  * will receive this unsolicited notification.
764  */
ctrl_data_write_lmk_kill_occurred(pid_t pid,uid_t uid)765 static void ctrl_data_write_lmk_kill_occurred(pid_t pid, uid_t uid) {
766     LMKD_CTRL_PACKET packet;
767     size_t len = lmkd_pack_set_prockills(packet, pid, uid);
768 
769     for (int i = 0; i < MAX_DATA_CONN; i++) {
770         if (data_sock[i].sock >= 0 && data_sock[i].async_event_mask & 1 << LMK_ASYNC_EVENT_KILL) {
771             ctrl_data_write(i, (char*)packet, len);
772         }
773     }
774 }
775 
776 /*
777  * Write the kill_stat/memory_stat over the data socket to be propagated via AMS to statsd
778  */
stats_write_lmk_kill_occurred(struct kill_stat * kill_st,struct memory_stat * mem_st)779 static void stats_write_lmk_kill_occurred(struct kill_stat *kill_st,
780                                           struct memory_stat *mem_st) {
781     LMK_KILL_OCCURRED_PACKET packet;
782     const size_t len = lmkd_pack_set_kill_occurred(packet, kill_st, mem_st);
783     if (len == 0) {
784         return;
785     }
786 
787     for (int i = 0; i < MAX_DATA_CONN; i++) {
788         if (data_sock[i].sock >= 0 && data_sock[i].async_event_mask & 1 << LMK_ASYNC_EVENT_STAT) {
789             ctrl_data_write(i, packet, len);
790         }
791     }
792 
793 }
794 
stats_write_lmk_kill_occurred_pid(int pid,struct kill_stat * kill_st,struct memory_stat * mem_st)795 static void stats_write_lmk_kill_occurred_pid(int pid, struct kill_stat *kill_st,
796                                               struct memory_stat *mem_st) {
797     kill_st->taskname = stats_get_task_name(pid);
798     if (kill_st->taskname != NULL) {
799         stats_write_lmk_kill_occurred(kill_st, mem_st);
800     }
801 }
802 
803 /*
804  * Write the state_changed over the data socket to be propagated via AMS to statsd
805  */
stats_write_lmk_state_changed(enum lmk_state state)806 static void stats_write_lmk_state_changed(enum lmk_state state) {
807     LMKD_CTRL_PACKET packet_state_changed;
808     const size_t len = lmkd_pack_set_state_changed(packet_state_changed, state);
809     if (len == 0) {
810         return;
811     }
812     for (int i = 0; i < MAX_DATA_CONN; i++) {
813         if (data_sock[i].sock >= 0 && data_sock[i].async_event_mask & 1 << LMK_ASYNC_EVENT_STAT) {
814             ctrl_data_write(i, (char*)packet_state_changed, len);
815         }
816     }
817 }
818 
poll_kernel(int poll_fd)819 static void poll_kernel(int poll_fd) {
820     if (poll_fd == -1) {
821         // not waiting
822         return;
823     }
824 
825     while (1) {
826         char rd_buf[256];
827         int bytes_read = TEMP_FAILURE_RETRY(pread(poll_fd, (void*)rd_buf, sizeof(rd_buf), 0));
828         if (bytes_read <= 0) break;
829         rd_buf[bytes_read] = '\0';
830 
831         int64_t pid;
832         int64_t uid;
833         int64_t group_leader_pid;
834         int64_t rss_in_pages;
835         struct memory_stat mem_st = {};
836         int16_t oom_score_adj;
837         int16_t min_score_adj;
838         int64_t starttime;
839         char* taskname = 0;
840 
841         int fields_read =
842                 sscanf(rd_buf,
843                        "%" SCNd64 " %" SCNd64 " %" SCNd64 " %" SCNd64 " %" SCNd64 " %" SCNd64
844                        " %" SCNd16 " %" SCNd16 " %" SCNd64 "\n%m[^\n]",
845                        &pid, &uid, &group_leader_pid, &mem_st.pgfault, &mem_st.pgmajfault,
846                        &rss_in_pages, &oom_score_adj, &min_score_adj, &starttime, &taskname);
847 
848         /* only the death of the group leader process is logged */
849         if (fields_read == 10 && group_leader_pid == pid) {
850             ctrl_data_write_lmk_kill_occurred((pid_t)pid, (uid_t)uid);
851             mem_st.process_start_time_ns = starttime * (NS_PER_SEC / sysconf(_SC_CLK_TCK));
852             mem_st.rss_in_bytes = rss_in_pages * PAGE_SIZE;
853 
854             struct kill_stat kill_st = {
855                 .uid = static_cast<int32_t>(uid),
856                 .kill_reason = NONE,
857                 .oom_score = oom_score_adj,
858                 .min_oom_score = min_score_adj,
859                 .free_mem_kb = 0,
860                 .free_swap_kb = 0,
861             };
862             stats_write_lmk_kill_occurred_pid(pid, &kill_st, &mem_st);
863         }
864 
865         free(taskname);
866     }
867 }
868 
init_poll_kernel()869 static bool init_poll_kernel() {
870     kpoll_fd = TEMP_FAILURE_RETRY(open("/proc/lowmemorykiller", O_RDONLY | O_NONBLOCK | O_CLOEXEC));
871 
872     if (kpoll_fd < 0) {
873         ALOGE("kernel lmk event file could not be opened; errno=%d", errno);
874         return false;
875     }
876 
877     return true;
878 }
879 
pid_lookup(int pid)880 static struct proc *pid_lookup(int pid) {
881     struct proc *procp;
882 
883     for (procp = pidhash[pid_hashfn(pid)]; procp && procp->pid != pid;
884          procp = procp->pidhash_next)
885             ;
886 
887     return procp;
888 }
889 
adjslot_insert(struct adjslot_list * head,struct adjslot_list * new_element)890 static void adjslot_insert(struct adjslot_list *head, struct adjslot_list *new_element)
891 {
892     struct adjslot_list *next = head->next;
893     new_element->prev = head;
894     new_element->next = next;
895     next->prev = new_element;
896     head->next = new_element;
897 }
898 
adjslot_remove(struct adjslot_list * old)899 static void adjslot_remove(struct adjslot_list *old)
900 {
901     struct adjslot_list *prev = old->prev;
902     struct adjslot_list *next = old->next;
903     next->prev = prev;
904     prev->next = next;
905 }
906 
adjslot_tail(struct adjslot_list * head)907 static struct adjslot_list *adjslot_tail(struct adjslot_list *head) {
908     struct adjslot_list *asl = head->prev;
909 
910     return asl == head ? NULL : asl;
911 }
912 
proc_slot(struct proc * procp)913 static void proc_slot(struct proc *procp) {
914     int adjslot = ADJTOSLOT(procp->oomadj);
915 
916     adjslot_insert(&procadjslot_list[adjslot], &procp->asl);
917 }
918 
proc_unslot(struct proc * procp)919 static void proc_unslot(struct proc *procp) {
920     adjslot_remove(&procp->asl);
921 }
922 
proc_insert(struct proc * procp)923 static void proc_insert(struct proc *procp) {
924     int hval = pid_hashfn(procp->pid);
925 
926     procp->pidhash_next = pidhash[hval];
927     pidhash[hval] = procp;
928     proc_slot(procp);
929 }
930 
pid_remove(int pid)931 static int pid_remove(int pid) {
932     int hval = pid_hashfn(pid);
933     struct proc *procp;
934     struct proc *prevp;
935 
936     for (procp = pidhash[hval], prevp = NULL; procp && procp->pid != pid;
937          procp = procp->pidhash_next)
938             prevp = procp;
939 
940     if (!procp)
941         return -1;
942 
943     if (!prevp)
944         pidhash[hval] = procp->pidhash_next;
945     else
946         prevp->pidhash_next = procp->pidhash_next;
947 
948     proc_unslot(procp);
949     /*
950      * Close pidfd here if we are not waiting for corresponding process to die,
951      * in which case stop_wait_for_proc_kill() will close the pidfd later
952      */
953     if (procp->pidfd >= 0 && procp->pidfd != last_kill_pid_or_fd) {
954         close(procp->pidfd);
955     }
956     free(procp);
957     return 0;
958 }
959 
960 /*
961  * Write a string to a file.
962  * Returns false if the file does not exist.
963  */
writefilestring(const char * path,const char * s,bool err_if_missing)964 static bool writefilestring(const char *path, const char *s,
965                             bool err_if_missing) {
966     int fd = open(path, O_WRONLY | O_CLOEXEC);
967     ssize_t len = strlen(s);
968     ssize_t ret;
969 
970     if (fd < 0) {
971         if (err_if_missing) {
972             ALOGE("Error opening %s; errno=%d", path, errno);
973         }
974         return false;
975     }
976 
977     ret = TEMP_FAILURE_RETRY(write(fd, s, len));
978     if (ret < 0) {
979         ALOGE("Error writing %s; errno=%d", path, errno);
980     } else if (ret < len) {
981         ALOGE("Short write on %s; length=%zd", path, ret);
982     }
983 
984     close(fd);
985     return true;
986 }
987 
get_time_diff_ms(struct timespec * from,struct timespec * to)988 static inline long get_time_diff_ms(struct timespec *from,
989                                     struct timespec *to) {
990     return (to->tv_sec - from->tv_sec) * (long)MS_PER_SEC +
991            (to->tv_nsec - from->tv_nsec) / (long)NS_PER_MS;
992 }
993 
994 /* Reads /proc/pid/status into buf. */
read_proc_status(int pid,char * buf,size_t buf_sz)995 static bool read_proc_status(int pid, char *buf, size_t buf_sz) {
996     char path[PATH_MAX];
997     int fd;
998     ssize_t size;
999 
1000     snprintf(path, PATH_MAX, "/proc/%d/status", pid);
1001     fd = open(path, O_RDONLY | O_CLOEXEC);
1002     if (fd < 0) {
1003         return false;
1004     }
1005 
1006     size = read_all(fd, buf, buf_sz - 1);
1007     close(fd);
1008     if (size < 0) {
1009         return false;
1010     }
1011     buf[size] = 0;
1012     return true;
1013 }
1014 
1015 /* Looks for tag in buf and parses the first integer */
parse_status_tag(char * buf,const char * tag,int64_t * out)1016 static bool parse_status_tag(char *buf, const char *tag, int64_t *out) {
1017     char *pos = buf;
1018     while (true) {
1019         pos = strstr(pos, tag);
1020         /* Stop if tag not found or found at the line beginning */
1021         if (pos == NULL || pos == buf || pos[-1] == '\n') {
1022             break;
1023         }
1024         pos++;
1025     }
1026 
1027     if (pos == NULL) {
1028         return false;
1029     }
1030 
1031     pos += strlen(tag);
1032     while (*pos == ' ') ++pos;
1033     return parse_int64(pos, out);
1034 }
1035 
proc_get_size(int pid)1036 static int proc_get_size(int pid) {
1037     char path[PATH_MAX];
1038     char line[LINE_MAX];
1039     int fd;
1040     int rss = 0;
1041     int total;
1042     ssize_t ret;
1043 
1044     /* gid containing AID_READPROC required */
1045     snprintf(path, PATH_MAX, "/proc/%d/statm", pid);
1046     fd = open(path, O_RDONLY | O_CLOEXEC);
1047     if (fd == -1)
1048         return -1;
1049 
1050     ret = read_all(fd, line, sizeof(line) - 1);
1051     if (ret < 0) {
1052         close(fd);
1053         return -1;
1054     }
1055     line[ret] = '\0';
1056 
1057     sscanf(line, "%d %d ", &total, &rss);
1058     close(fd);
1059     return rss;
1060 }
1061 
proc_get_name(int pid,char * buf,size_t buf_size)1062 static char *proc_get_name(int pid, char *buf, size_t buf_size) {
1063     char path[PATH_MAX];
1064     int fd;
1065     char *cp;
1066     ssize_t ret;
1067 
1068     /* gid containing AID_READPROC required */
1069     snprintf(path, PATH_MAX, "/proc/%d/cmdline", pid);
1070     fd = open(path, O_RDONLY | O_CLOEXEC);
1071     if (fd == -1) {
1072         return NULL;
1073     }
1074     ret = read_all(fd, buf, buf_size - 1);
1075     close(fd);
1076     if (ret < 0) {
1077         return NULL;
1078     }
1079     buf[ret] = '\0';
1080 
1081     cp = strchr(buf, ' ');
1082     if (cp) {
1083         *cp = '\0';
1084     }
1085 
1086     return buf;
1087 }
1088 
cmd_procprio(LMKD_CTRL_PACKET packet,int field_count,struct ucred * cred)1089 static void cmd_procprio(LMKD_CTRL_PACKET packet, int field_count, struct ucred *cred) {
1090     struct proc *procp;
1091     char path[LINE_MAX];
1092     char val[20];
1093     int soft_limit_mult;
1094     struct lmk_procprio params;
1095     bool is_system_server;
1096     struct passwd *pwdrec;
1097     int64_t tgid;
1098     char buf[PAGE_SIZE];
1099 
1100     lmkd_pack_get_procprio(packet, field_count, &params);
1101 
1102     if (params.oomadj < OOM_SCORE_ADJ_MIN ||
1103         params.oomadj > OOM_SCORE_ADJ_MAX) {
1104         ALOGE("Invalid PROCPRIO oomadj argument %d", params.oomadj);
1105         return;
1106     }
1107 
1108     if (params.ptype < PROC_TYPE_FIRST || params.ptype >= PROC_TYPE_COUNT) {
1109         ALOGE("Invalid PROCPRIO process type argument %d", params.ptype);
1110         return;
1111     }
1112 
1113     /* Check if registered process is a thread group leader */
1114     if (read_proc_status(params.pid, buf, sizeof(buf))) {
1115         if (parse_status_tag(buf, PROC_STATUS_TGID_FIELD, &tgid) && tgid != params.pid) {
1116             ALOGE("Attempt to register a task that is not a thread group leader "
1117                   "(tid %d, tgid %" PRId64 ")", params.pid, tgid);
1118             return;
1119         }
1120     }
1121 
1122     /* gid containing AID_READPROC required */
1123     /* CAP_SYS_RESOURCE required */
1124     /* CAP_DAC_OVERRIDE required */
1125     snprintf(path, sizeof(path), "/proc/%d/oom_score_adj", params.pid);
1126     snprintf(val, sizeof(val), "%d", params.oomadj);
1127     if (!writefilestring(path, val, false)) {
1128         ALOGW("Failed to open %s; errno=%d: process %d might have been killed",
1129               path, errno, params.pid);
1130         /* If this file does not exist the process is dead. */
1131         return;
1132     }
1133 
1134     if (use_inkernel_interface) {
1135         stats_store_taskname(params.pid, proc_get_name(params.pid, path, sizeof(path)));
1136         return;
1137     }
1138 
1139     /* lmkd should not change soft limits for services */
1140     if (params.ptype == PROC_TYPE_APP && per_app_memcg) {
1141         if (params.oomadj >= 900) {
1142             soft_limit_mult = 0;
1143         } else if (params.oomadj >= 800) {
1144             soft_limit_mult = 0;
1145         } else if (params.oomadj >= 700) {
1146             soft_limit_mult = 0;
1147         } else if (params.oomadj >= 600) {
1148             // Launcher should be perceptible, don't kill it.
1149             params.oomadj = 200;
1150             soft_limit_mult = 1;
1151         } else if (params.oomadj >= 500) {
1152             soft_limit_mult = 0;
1153         } else if (params.oomadj >= 400) {
1154             soft_limit_mult = 0;
1155         } else if (params.oomadj >= 300) {
1156             soft_limit_mult = 1;
1157         } else if (params.oomadj >= 200) {
1158             soft_limit_mult = 8;
1159         } else if (params.oomadj >= 100) {
1160             soft_limit_mult = 10;
1161         } else if (params.oomadj >=   0) {
1162             soft_limit_mult = 20;
1163         } else {
1164             // Persistent processes will have a large
1165             // soft limit 512MB.
1166             soft_limit_mult = 64;
1167         }
1168 
1169         snprintf(path, sizeof(path), MEMCG_SYSFS_PATH
1170                  "apps/uid_%d/pid_%d/memory.soft_limit_in_bytes",
1171                  params.uid, params.pid);
1172         snprintf(val, sizeof(val), "%d", soft_limit_mult * EIGHT_MEGA);
1173 
1174         /*
1175          * system_server process has no memcg under /dev/memcg/apps but should be
1176          * registered with lmkd. This is the best way so far to identify it.
1177          */
1178         is_system_server = (params.oomadj == SYSTEM_ADJ &&
1179                             (pwdrec = getpwnam("system")) != NULL &&
1180                             params.uid == pwdrec->pw_uid);
1181         writefilestring(path, val, !is_system_server);
1182     }
1183 
1184     procp = pid_lookup(params.pid);
1185     if (!procp) {
1186         int pidfd = -1;
1187 
1188         if (pidfd_supported) {
1189             pidfd = TEMP_FAILURE_RETRY(pidfd_open(params.pid, 0));
1190             if (pidfd < 0) {
1191                 ALOGE("pidfd_open for pid %d failed; errno=%d", params.pid, errno);
1192                 return;
1193             }
1194         }
1195 
1196         procp = static_cast<struct proc*>(calloc(1, sizeof(struct proc)));
1197         if (!procp) {
1198             // Oh, the irony.  May need to rebuild our state.
1199             return;
1200         }
1201 
1202         procp->pid = params.pid;
1203         procp->pidfd = pidfd;
1204         procp->uid = params.uid;
1205         procp->reg_pid = cred->pid;
1206         procp->oomadj = params.oomadj;
1207         proc_insert(procp);
1208     } else {
1209         if (!claim_record(procp, cred->pid)) {
1210             char buf[LINE_MAX];
1211             char *taskname = proc_get_name(cred->pid, buf, sizeof(buf));
1212             /* Only registrant of the record can remove it */
1213             ALOGE("%s (%d, %d) attempts to modify a process registered by another client",
1214                 taskname ? taskname : "A process ", cred->uid, cred->pid);
1215             return;
1216         }
1217         proc_unslot(procp);
1218         procp->oomadj = params.oomadj;
1219         proc_slot(procp);
1220     }
1221 }
1222 
cmd_procremove(LMKD_CTRL_PACKET packet,struct ucred * cred)1223 static void cmd_procremove(LMKD_CTRL_PACKET packet, struct ucred *cred) {
1224     struct lmk_procremove params;
1225     struct proc *procp;
1226 
1227     lmkd_pack_get_procremove(packet, &params);
1228 
1229     if (use_inkernel_interface) {
1230         /*
1231          * Perform an extra check before the pid is removed, after which it
1232          * will be impossible for poll_kernel to get the taskname. poll_kernel()
1233          * is potentially a long-running blocking function; however this method
1234          * handles AMS requests but does not block AMS.
1235          */
1236         poll_kernel(kpoll_fd);
1237 
1238         stats_remove_taskname(params.pid);
1239         return;
1240     }
1241 
1242     procp = pid_lookup(params.pid);
1243     if (!procp) {
1244         return;
1245     }
1246 
1247     if (!claim_record(procp, cred->pid)) {
1248         char buf[LINE_MAX];
1249         char *taskname = proc_get_name(cred->pid, buf, sizeof(buf));
1250         /* Only registrant of the record can remove it */
1251         ALOGE("%s (%d, %d) attempts to unregister a process registered by another client",
1252             taskname ? taskname : "A process ", cred->uid, cred->pid);
1253         return;
1254     }
1255 
1256     /*
1257      * WARNING: After pid_remove() procp is freed and can't be used!
1258      * Therefore placed at the end of the function.
1259      */
1260     pid_remove(params.pid);
1261 }
1262 
cmd_procpurge(struct ucred * cred)1263 static void cmd_procpurge(struct ucred *cred) {
1264     int i;
1265     struct proc *procp;
1266     struct proc *next;
1267 
1268     if (use_inkernel_interface) {
1269         stats_purge_tasknames();
1270         return;
1271     }
1272 
1273     for (i = 0; i < PIDHASH_SZ; i++) {
1274         procp = pidhash[i];
1275         while (procp) {
1276             next = procp->pidhash_next;
1277             /* Purge only records created by the requestor */
1278             if (claim_record(procp, cred->pid)) {
1279                 pid_remove(procp->pid);
1280             }
1281             procp = next;
1282         }
1283     }
1284 }
1285 
cmd_subscribe(int dsock_idx,LMKD_CTRL_PACKET packet)1286 static void cmd_subscribe(int dsock_idx, LMKD_CTRL_PACKET packet) {
1287     struct lmk_subscribe params;
1288 
1289     lmkd_pack_get_subscribe(packet, &params);
1290     data_sock[dsock_idx].async_event_mask |= 1 << params.evt_type;
1291 }
1292 
inc_killcnt(int oomadj)1293 static void inc_killcnt(int oomadj) {
1294     int slot = ADJTOSLOT(oomadj);
1295     uint8_t idx = killcnt_idx[slot];
1296 
1297     if (idx == KILLCNT_INVALID_IDX) {
1298         /* index is not assigned for this oomadj */
1299         if (killcnt_free_idx < MAX_DISTINCT_OOM_ADJ) {
1300             killcnt_idx[slot] = killcnt_free_idx;
1301             killcnt[killcnt_free_idx] = 1;
1302             killcnt_free_idx++;
1303         } else {
1304             ALOGW("Number of distinct oomadj levels exceeds %d",
1305                 MAX_DISTINCT_OOM_ADJ);
1306         }
1307     } else {
1308         /*
1309          * wraparound is highly unlikely and is detectable using total
1310          * counter because it has to be equal to the sum of all counters
1311          */
1312         killcnt[idx]++;
1313     }
1314     /* increment total kill counter */
1315     killcnt_total++;
1316 }
1317 
get_killcnt(int min_oomadj,int max_oomadj)1318 static int get_killcnt(int min_oomadj, int max_oomadj) {
1319     int slot;
1320     int count = 0;
1321 
1322     if (min_oomadj > max_oomadj)
1323         return 0;
1324 
1325     /* special case to get total kill count */
1326     if (min_oomadj > OOM_SCORE_ADJ_MAX)
1327         return killcnt_total;
1328 
1329     while (min_oomadj <= max_oomadj &&
1330            (slot = ADJTOSLOT(min_oomadj)) < ADJTOSLOT_COUNT) {
1331         uint8_t idx = killcnt_idx[slot];
1332         if (idx != KILLCNT_INVALID_IDX) {
1333             count += killcnt[idx];
1334         }
1335         min_oomadj++;
1336     }
1337 
1338     return count;
1339 }
1340 
cmd_getkillcnt(LMKD_CTRL_PACKET packet)1341 static int cmd_getkillcnt(LMKD_CTRL_PACKET packet) {
1342     struct lmk_getkillcnt params;
1343 
1344     if (use_inkernel_interface) {
1345         /* kernel driver does not expose this information */
1346         return 0;
1347     }
1348 
1349     lmkd_pack_get_getkillcnt(packet, &params);
1350 
1351     return get_killcnt(params.min_oomadj, params.max_oomadj);
1352 }
1353 
cmd_target(int ntargets,LMKD_CTRL_PACKET packet)1354 static void cmd_target(int ntargets, LMKD_CTRL_PACKET packet) {
1355     int i;
1356     struct lmk_target target;
1357     char minfree_str[PROPERTY_VALUE_MAX];
1358     char *pstr = minfree_str;
1359     char *pend = minfree_str + sizeof(minfree_str);
1360     static struct timespec last_req_tm;
1361     struct timespec curr_tm;
1362 
1363     if (ntargets < 1 || ntargets > (int)ARRAY_SIZE(lowmem_adj))
1364         return;
1365 
1366     /*
1367      * Ratelimit minfree updates to once per TARGET_UPDATE_MIN_INTERVAL_MS
1368      * to prevent DoS attacks
1369      */
1370     if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
1371         ALOGE("Failed to get current time");
1372         return;
1373     }
1374 
1375     if (get_time_diff_ms(&last_req_tm, &curr_tm) <
1376         TARGET_UPDATE_MIN_INTERVAL_MS) {
1377         ALOGE("Ignoring frequent updated to lmkd limits");
1378         return;
1379     }
1380 
1381     last_req_tm = curr_tm;
1382 
1383     for (i = 0; i < ntargets; i++) {
1384         lmkd_pack_get_target(packet, i, &target);
1385         lowmem_minfree[i] = target.minfree;
1386         lowmem_adj[i] = target.oom_adj_score;
1387 
1388         pstr += snprintf(pstr, pend - pstr, "%d:%d,", target.minfree,
1389             target.oom_adj_score);
1390         if (pstr >= pend) {
1391             /* if no more space in the buffer then terminate the loop */
1392             pstr = pend;
1393             break;
1394         }
1395     }
1396 
1397     lowmem_targets_size = ntargets;
1398 
1399     /* Override the last extra comma */
1400     pstr[-1] = '\0';
1401     property_set("sys.lmk.minfree_levels", minfree_str);
1402 
1403     if (has_inkernel_module) {
1404         char minfreestr[128];
1405         char killpriostr[128];
1406 
1407         minfreestr[0] = '\0';
1408         killpriostr[0] = '\0';
1409 
1410         for (i = 0; i < lowmem_targets_size; i++) {
1411             char val[40];
1412 
1413             if (i) {
1414                 strlcat(minfreestr, ",", sizeof(minfreestr));
1415                 strlcat(killpriostr, ",", sizeof(killpriostr));
1416             }
1417 
1418             snprintf(val, sizeof(val), "%d", use_inkernel_interface ? lowmem_minfree[i] : 0);
1419             strlcat(minfreestr, val, sizeof(minfreestr));
1420             snprintf(val, sizeof(val), "%d", use_inkernel_interface ? lowmem_adj[i] : 0);
1421             strlcat(killpriostr, val, sizeof(killpriostr));
1422         }
1423 
1424         writefilestring(INKERNEL_MINFREE_PATH, minfreestr, true);
1425         writefilestring(INKERNEL_ADJ_PATH, killpriostr, true);
1426     }
1427 }
1428 
ctrl_command_handler(int dsock_idx)1429 static void ctrl_command_handler(int dsock_idx) {
1430     LMKD_CTRL_PACKET packet;
1431     struct ucred cred;
1432     int len;
1433     enum lmk_cmd cmd;
1434     int nargs;
1435     int targets;
1436     int kill_cnt;
1437     int result;
1438 
1439     len = ctrl_data_read(dsock_idx, (char *)packet, CTRL_PACKET_MAX_SIZE, &cred);
1440     if (len <= 0)
1441         return;
1442 
1443     if (len < (int)sizeof(int)) {
1444         ALOGE("Wrong control socket read length len=%d", len);
1445         return;
1446     }
1447 
1448     cmd = lmkd_pack_get_cmd(packet);
1449     nargs = len / sizeof(int) - 1;
1450     if (nargs < 0)
1451         goto wronglen;
1452 
1453     switch(cmd) {
1454     case LMK_TARGET:
1455         targets = nargs / 2;
1456         if (nargs & 0x1 || targets > (int)ARRAY_SIZE(lowmem_adj))
1457             goto wronglen;
1458         cmd_target(targets, packet);
1459         break;
1460     case LMK_PROCPRIO:
1461         /* process type field is optional for backward compatibility */
1462         if (nargs < 3 || nargs > 4)
1463             goto wronglen;
1464         cmd_procprio(packet, nargs, &cred);
1465         break;
1466     case LMK_PROCREMOVE:
1467         if (nargs != 1)
1468             goto wronglen;
1469         cmd_procremove(packet, &cred);
1470         break;
1471     case LMK_PROCPURGE:
1472         if (nargs != 0)
1473             goto wronglen;
1474         cmd_procpurge(&cred);
1475         break;
1476     case LMK_GETKILLCNT:
1477         if (nargs != 2)
1478             goto wronglen;
1479         kill_cnt = cmd_getkillcnt(packet);
1480         len = lmkd_pack_set_getkillcnt_repl(packet, kill_cnt);
1481         if (ctrl_data_write(dsock_idx, (char *)packet, len) != len)
1482             return;
1483         break;
1484     case LMK_SUBSCRIBE:
1485         if (nargs != 1)
1486             goto wronglen;
1487         cmd_subscribe(dsock_idx, packet);
1488         break;
1489     case LMK_PROCKILL:
1490         /* This command code is NOT expected at all */
1491         ALOGE("Received unexpected command code %d", cmd);
1492         break;
1493     case LMK_UPDATE_PROPS:
1494         if (nargs != 0)
1495             goto wronglen;
1496         update_props();
1497         if (!use_inkernel_interface) {
1498             /* Reinitialize monitors to apply new settings */
1499             destroy_monitors();
1500             result = init_monitors() ? 0 : -1;
1501         } else {
1502             result = 0;
1503         }
1504         len = lmkd_pack_set_update_props_repl(packet, result);
1505         if (ctrl_data_write(dsock_idx, (char *)packet, len) != len) {
1506             ALOGE("Failed to report operation results");
1507         }
1508         if (!result) {
1509             ALOGI("Properties reinitilized");
1510         } else {
1511             /* New settings can't be supported, crash to be restarted */
1512             ALOGE("New configuration is not supported. Exiting...");
1513             exit(1);
1514         }
1515         break;
1516     default:
1517         ALOGE("Received unknown command code %d", cmd);
1518         return;
1519     }
1520 
1521     return;
1522 
1523 wronglen:
1524     ALOGE("Wrong control socket read length cmd=%d len=%d", cmd, len);
1525 }
1526 
ctrl_data_handler(int data,uint32_t events,struct polling_params * poll_params __unused)1527 static void ctrl_data_handler(int data, uint32_t events,
1528                               struct polling_params *poll_params __unused) {
1529     if (events & EPOLLIN) {
1530         ctrl_command_handler(data);
1531     }
1532 }
1533 
get_free_dsock()1534 static int get_free_dsock() {
1535     for (int i = 0; i < MAX_DATA_CONN; i++) {
1536         if (data_sock[i].sock < 0) {
1537             return i;
1538         }
1539     }
1540     return -1;
1541 }
1542 
ctrl_connect_handler(int data __unused,uint32_t events __unused,struct polling_params * poll_params __unused)1543 static void ctrl_connect_handler(int data __unused, uint32_t events __unused,
1544                                  struct polling_params *poll_params __unused) {
1545     struct epoll_event epev;
1546     int free_dscock_idx = get_free_dsock();
1547 
1548     if (free_dscock_idx < 0) {
1549         /*
1550          * Number of data connections exceeded max supported. This should not
1551          * happen but if it does we drop all existing connections and accept
1552          * the new one. This prevents inactive connections from monopolizing
1553          * data socket and if we drop ActivityManager connection it will
1554          * immediately reconnect.
1555          */
1556         for (int i = 0; i < MAX_DATA_CONN; i++) {
1557             ctrl_data_close(i);
1558         }
1559         free_dscock_idx = 0;
1560     }
1561 
1562     data_sock[free_dscock_idx].sock = accept(ctrl_sock.sock, NULL, NULL);
1563     if (data_sock[free_dscock_idx].sock < 0) {
1564         ALOGE("lmkd control socket accept failed; errno=%d", errno);
1565         return;
1566     }
1567 
1568     ALOGI("lmkd data connection established");
1569     /* use data to store data connection idx */
1570     data_sock[free_dscock_idx].handler_info.data = free_dscock_idx;
1571     data_sock[free_dscock_idx].handler_info.handler = ctrl_data_handler;
1572     data_sock[free_dscock_idx].async_event_mask = 0;
1573     epev.events = EPOLLIN;
1574     epev.data.ptr = (void *)&(data_sock[free_dscock_idx].handler_info);
1575     if (epoll_ctl(epollfd, EPOLL_CTL_ADD, data_sock[free_dscock_idx].sock, &epev) == -1) {
1576         ALOGE("epoll_ctl for data connection socket failed; errno=%d", errno);
1577         ctrl_data_close(free_dscock_idx);
1578         return;
1579     }
1580     maxevents++;
1581 }
1582 
1583 /*
1584  * /proc/zoneinfo parsing routines
1585  * Expected file format is:
1586  *
1587  *   Node <node_id>, zone   <zone_name>
1588  *   (
1589  *    per-node stats
1590  *       (<per-node field name> <value>)+
1591  *   )?
1592  *   (pages free     <value>
1593  *       (<per-zone field name> <value>)+
1594  *    pagesets
1595  *       (<unused fields>)*
1596  *   )+
1597  *   ...
1598  */
zoneinfo_parse_protection(char * buf,struct zoneinfo_zone * zone)1599 static void zoneinfo_parse_protection(char *buf, struct zoneinfo_zone *zone) {
1600     int zone_idx;
1601     int64_t max = 0;
1602     char *save_ptr;
1603 
1604     for (buf = strtok_r(buf, "(), ", &save_ptr), zone_idx = 0;
1605          buf && zone_idx < MAX_NR_ZONES;
1606          buf = strtok_r(NULL, "), ", &save_ptr), zone_idx++) {
1607         long long zoneval = strtoll(buf, &buf, 0);
1608         if (zoneval > max) {
1609             max = (zoneval > INT64_MAX) ? INT64_MAX : zoneval;
1610         }
1611         zone->protection[zone_idx] = zoneval;
1612     }
1613     zone->max_protection = max;
1614 }
1615 
zoneinfo_parse_zone(char ** buf,struct zoneinfo_zone * zone)1616 static int zoneinfo_parse_zone(char **buf, struct zoneinfo_zone *zone) {
1617     for (char *line = strtok_r(NULL, "\n", buf); line;
1618          line = strtok_r(NULL, "\n", buf)) {
1619         char *cp;
1620         char *ap;
1621         char *save_ptr;
1622         int64_t val;
1623         int field_idx;
1624         enum field_match_result match_res;
1625 
1626         cp = strtok_r(line, " ", &save_ptr);
1627         if (!cp) {
1628             return false;
1629         }
1630 
1631         field_idx = find_field(cp, zoneinfo_zone_spec_field_names, ZI_ZONE_SPEC_FIELD_COUNT);
1632         if (field_idx >= 0) {
1633             /* special field */
1634             if (field_idx == ZI_ZONE_SPEC_PAGESETS) {
1635                 /* no mode fields we are interested in */
1636                 return true;
1637             }
1638 
1639             /* protection field */
1640             ap = strtok_r(NULL, ")", &save_ptr);
1641             if (ap) {
1642                 zoneinfo_parse_protection(ap, zone);
1643             }
1644             continue;
1645         }
1646 
1647         ap = strtok_r(NULL, " ", &save_ptr);
1648         if (!ap) {
1649             continue;
1650         }
1651 
1652         match_res = match_field(cp, ap, zoneinfo_zone_field_names, ZI_ZONE_FIELD_COUNT,
1653             &val, &field_idx);
1654         if (match_res == PARSE_FAIL) {
1655             return false;
1656         }
1657         if (match_res == PARSE_SUCCESS) {
1658             zone->fields.arr[field_idx] = val;
1659         }
1660         if (field_idx == ZI_ZONE_PRESENT && val == 0) {
1661             /* zone is not populated, stop parsing it */
1662             return true;
1663         }
1664     }
1665     return false;
1666 }
1667 
zoneinfo_parse_node(char ** buf,struct zoneinfo_node * node)1668 static int zoneinfo_parse_node(char **buf, struct zoneinfo_node *node) {
1669     int fields_to_match = ZI_NODE_FIELD_COUNT;
1670 
1671     for (char *line = strtok_r(NULL, "\n", buf); line;
1672          line = strtok_r(NULL, "\n", buf)) {
1673         char *cp;
1674         char *ap;
1675         char *save_ptr;
1676         int64_t val;
1677         int field_idx;
1678         enum field_match_result match_res;
1679 
1680         cp = strtok_r(line, " ", &save_ptr);
1681         if (!cp) {
1682             return false;
1683         }
1684 
1685         ap = strtok_r(NULL, " ", &save_ptr);
1686         if (!ap) {
1687             return false;
1688         }
1689 
1690         match_res = match_field(cp, ap, zoneinfo_node_field_names, ZI_NODE_FIELD_COUNT,
1691             &val, &field_idx);
1692         if (match_res == PARSE_FAIL) {
1693             return false;
1694         }
1695         if (match_res == PARSE_SUCCESS) {
1696             node->fields.arr[field_idx] = val;
1697             fields_to_match--;
1698             if (!fields_to_match) {
1699                 return true;
1700             }
1701         }
1702     }
1703     return false;
1704 }
1705 
zoneinfo_parse(struct zoneinfo * zi)1706 static int zoneinfo_parse(struct zoneinfo *zi) {
1707     static struct reread_data file_data = {
1708         .filename = ZONEINFO_PATH,
1709         .fd = -1,
1710     };
1711     char *buf;
1712     char *save_ptr;
1713     char *line;
1714     char zone_name[LINE_MAX + 1];
1715     struct zoneinfo_node *node = NULL;
1716     int node_idx = 0;
1717     int zone_idx = 0;
1718 
1719     memset(zi, 0, sizeof(struct zoneinfo));
1720 
1721     if ((buf = reread_file(&file_data)) == NULL) {
1722         return -1;
1723     }
1724 
1725     for (line = strtok_r(buf, "\n", &save_ptr); line;
1726          line = strtok_r(NULL, "\n", &save_ptr)) {
1727         int node_id;
1728         if (sscanf(line, "Node %d, zone %" STRINGIFY(LINE_MAX) "s", &node_id, zone_name) == 2) {
1729             if (!node || node->id != node_id) {
1730                 /* new node is found */
1731                 if (node) {
1732                     node->zone_count = zone_idx + 1;
1733                     node_idx++;
1734                     if (node_idx == MAX_NR_NODES) {
1735                         /* max node count exceeded */
1736                         ALOGE("%s parse error", file_data.filename);
1737                         return -1;
1738                     }
1739                 }
1740                 node = &zi->nodes[node_idx];
1741                 node->id = node_id;
1742                 zone_idx = 0;
1743                 if (!zoneinfo_parse_node(&save_ptr, node)) {
1744                     ALOGE("%s parse error", file_data.filename);
1745                     return -1;
1746                 }
1747             } else {
1748                 /* new zone is found */
1749                 zone_idx++;
1750             }
1751             if (!zoneinfo_parse_zone(&save_ptr, &node->zones[zone_idx])) {
1752                 ALOGE("%s parse error", file_data.filename);
1753                 return -1;
1754             }
1755         }
1756     }
1757     if (!node) {
1758         ALOGE("%s parse error", file_data.filename);
1759         return -1;
1760     }
1761     node->zone_count = zone_idx + 1;
1762     zi->node_count = node_idx + 1;
1763 
1764     /* calculate totals fields */
1765     for (node_idx = 0; node_idx < zi->node_count; node_idx++) {
1766         node = &zi->nodes[node_idx];
1767         for (zone_idx = 0; zone_idx < node->zone_count; zone_idx++) {
1768             struct zoneinfo_zone *zone = &zi->nodes[node_idx].zones[zone_idx];
1769             zi->totalreserve_pages += zone->max_protection + zone->fields.field.high;
1770         }
1771         zi->total_inactive_file += node->fields.field.nr_inactive_file;
1772         zi->total_active_file += node->fields.field.nr_active_file;
1773     }
1774     return 0;
1775 }
1776 
1777 /* /proc/meminfo parsing routines */
meminfo_parse_line(char * line,union meminfo * mi)1778 static bool meminfo_parse_line(char *line, union meminfo *mi) {
1779     char *cp = line;
1780     char *ap;
1781     char *save_ptr;
1782     int64_t val;
1783     int field_idx;
1784     enum field_match_result match_res;
1785 
1786     cp = strtok_r(line, " ", &save_ptr);
1787     if (!cp) {
1788         return false;
1789     }
1790 
1791     ap = strtok_r(NULL, " ", &save_ptr);
1792     if (!ap) {
1793         return false;
1794     }
1795 
1796     match_res = match_field(cp, ap, meminfo_field_names, MI_FIELD_COUNT,
1797         &val, &field_idx);
1798     if (match_res == PARSE_SUCCESS) {
1799         mi->arr[field_idx] = val / page_k;
1800     }
1801     return (match_res != PARSE_FAIL);
1802 }
1803 
read_gpu_total_kb()1804 static int64_t read_gpu_total_kb() {
1805     static int fd = android::bpf::bpfFdGet(
1806             "/sys/fs/bpf/map_gpu_mem_gpu_mem_total_map", BPF_F_RDONLY);
1807     static constexpr uint64_t kBpfKeyGpuTotalUsage = 0;
1808     uint64_t value;
1809 
1810     if (fd < 0) {
1811         return 0;
1812     }
1813 
1814     return android::bpf::findMapEntry(fd, &kBpfKeyGpuTotalUsage, &value)
1815             ? 0
1816             : (int32_t)(value / 1024);
1817 }
1818 
meminfo_parse(union meminfo * mi)1819 static int meminfo_parse(union meminfo *mi) {
1820     static struct reread_data file_data = {
1821         .filename = MEMINFO_PATH,
1822         .fd = -1,
1823     };
1824     char *buf;
1825     char *save_ptr;
1826     char *line;
1827 
1828     memset(mi, 0, sizeof(union meminfo));
1829 
1830     if ((buf = reread_file(&file_data)) == NULL) {
1831         return -1;
1832     }
1833 
1834     for (line = strtok_r(buf, "\n", &save_ptr); line;
1835          line = strtok_r(NULL, "\n", &save_ptr)) {
1836         if (!meminfo_parse_line(line, mi)) {
1837             ALOGE("%s parse error", file_data.filename);
1838             return -1;
1839         }
1840     }
1841     mi->field.nr_file_pages = mi->field.cached + mi->field.swap_cached +
1842         mi->field.buffers;
1843     mi->field.total_gpu_kb = read_gpu_total_kb();
1844 
1845     return 0;
1846 }
1847 
1848 /* /proc/vmstat parsing routines */
vmstat_parse_line(char * line,union vmstat * vs)1849 static bool vmstat_parse_line(char *line, union vmstat *vs) {
1850     char *cp;
1851     char *ap;
1852     char *save_ptr;
1853     int64_t val;
1854     int field_idx;
1855     enum field_match_result match_res;
1856 
1857     cp = strtok_r(line, " ", &save_ptr);
1858     if (!cp) {
1859         return false;
1860     }
1861 
1862     ap = strtok_r(NULL, " ", &save_ptr);
1863     if (!ap) {
1864         return false;
1865     }
1866 
1867     match_res = match_field(cp, ap, vmstat_field_names, VS_FIELD_COUNT,
1868         &val, &field_idx);
1869     if (match_res == PARSE_SUCCESS) {
1870         vs->arr[field_idx] = val;
1871     }
1872     return (match_res != PARSE_FAIL);
1873 }
1874 
vmstat_parse(union vmstat * vs)1875 static int vmstat_parse(union vmstat *vs) {
1876     static struct reread_data file_data = {
1877         .filename = VMSTAT_PATH,
1878         .fd = -1,
1879     };
1880     char *buf;
1881     char *save_ptr;
1882     char *line;
1883 
1884     memset(vs, 0, sizeof(union vmstat));
1885 
1886     if ((buf = reread_file(&file_data)) == NULL) {
1887         return -1;
1888     }
1889 
1890     for (line = strtok_r(buf, "\n", &save_ptr); line;
1891          line = strtok_r(NULL, "\n", &save_ptr)) {
1892         if (!vmstat_parse_line(line, vs)) {
1893             ALOGE("%s parse error", file_data.filename);
1894             return -1;
1895         }
1896     }
1897 
1898     return 0;
1899 }
1900 
psi_parse(struct reread_data * file_data,struct psi_stats stats[],bool full)1901 static int psi_parse(struct reread_data *file_data, struct psi_stats stats[], bool full) {
1902     char *buf;
1903     char *save_ptr;
1904     char *line;
1905 
1906     if ((buf = reread_file(file_data)) == NULL) {
1907         return -1;
1908     }
1909 
1910     line = strtok_r(buf, "\n", &save_ptr);
1911     if (parse_psi_line(line, PSI_SOME, stats)) {
1912         return -1;
1913     }
1914     if (full) {
1915         line = strtok_r(NULL, "\n", &save_ptr);
1916         if (parse_psi_line(line, PSI_FULL, stats)) {
1917             return -1;
1918         }
1919     }
1920 
1921     return 0;
1922 }
1923 
psi_parse_mem(struct psi_data * psi_data)1924 static int psi_parse_mem(struct psi_data *psi_data) {
1925     static struct reread_data file_data = {
1926         .filename = PSI_PATH_MEMORY,
1927         .fd = -1,
1928     };
1929     return psi_parse(&file_data, psi_data->mem_stats, true);
1930 }
1931 
psi_parse_io(struct psi_data * psi_data)1932 static int psi_parse_io(struct psi_data *psi_data) {
1933     static struct reread_data file_data = {
1934         .filename = PSI_PATH_IO,
1935         .fd = -1,
1936     };
1937     return psi_parse(&file_data, psi_data->io_stats, true);
1938 }
1939 
psi_parse_cpu(struct psi_data * psi_data)1940 static int psi_parse_cpu(struct psi_data *psi_data) {
1941     static struct reread_data file_data = {
1942         .filename = PSI_PATH_CPU,
1943         .fd = -1,
1944     };
1945     return psi_parse(&file_data, psi_data->cpu_stats, false);
1946 }
1947 
/* Why lmkd woke up; passed to record_wakeup_time() */
enum wakeup_reason {
    Event,   /* wakeup caused by a memory pressure event */
    Polling  /* periodic wakeup scheduled after an event */
};
1952 
/* Wakeup bookkeeping maintained by record_wakeup_time() and reported by killinfo_log() */
struct wakeup_info {
    struct timespec wakeup_tm;       /* time of the most recent wakeup */
    struct timespec prev_wakeup_tm;  /* time of the wakeup before that */
    struct timespec last_event_tm;   /* time of the most recent Event wakeup */
    int wakeups_since_event;         /* non-Event wakeups since last_event_tm */
    int skipped_wakeups;             /* wakeups skipped since the last Event wakeup */
};
1960 
1961 /*
1962  * After the initial memory pressure event is received lmkd schedules periodic wakeups to check
1963  * the memory conditions and kill if needed (polling). This is done because pressure events are
1964  * rate-limited and memory conditions can change in between events. Therefore after the initial
1965  * event there might be multiple wakeups. This function records the wakeup information such as the
1966  * timestamps of the last event and the last wakeup, the number of wakeups since the last event
1967  * and how many of those wakeups were skipped (some wakeups are skipped if previously killed
1968  * process is still freeing its memory).
1969  */
record_wakeup_time(struct timespec * tm,enum wakeup_reason reason,struct wakeup_info * wi)1970 static void record_wakeup_time(struct timespec *tm, enum wakeup_reason reason,
1971                                struct wakeup_info *wi) {
1972     wi->prev_wakeup_tm = wi->wakeup_tm;
1973     wi->wakeup_tm = *tm;
1974     if (reason == Event) {
1975         wi->last_event_tm = *tm;
1976         wi->wakeups_since_event = 0;
1977         wi->skipped_wakeups = 0;
1978     } else {
1979         wi->wakeups_since_event++;
1980     }
1981 }
1982 
killinfo_log(struct proc * procp,int min_oom_score,int rss_kb,int swap_kb,int kill_reason,union meminfo * mi,struct wakeup_info * wi,struct timespec * tm,struct psi_data * pd)1983 static void killinfo_log(struct proc* procp, int min_oom_score, int rss_kb,
1984                          int swap_kb, int kill_reason, union meminfo *mi,
1985                          struct wakeup_info *wi, struct timespec *tm,
1986                          struct psi_data *pd) {
1987     /* log process information */
1988     android_log_write_int32(ctx, procp->pid);
1989     android_log_write_int32(ctx, procp->uid);
1990     android_log_write_int32(ctx, procp->oomadj);
1991     android_log_write_int32(ctx, min_oom_score);
1992     android_log_write_int32(ctx, (int32_t)min(rss_kb, INT32_MAX));
1993     android_log_write_int32(ctx, kill_reason);
1994 
1995     /* log meminfo fields */
1996     for (int field_idx = 0; field_idx < MI_FIELD_COUNT; field_idx++) {
1997         android_log_write_int32(ctx, (int32_t)min(mi->arr[field_idx] * page_k, INT32_MAX));
1998     }
1999 
2000     /* log lmkd wakeup information */
2001     android_log_write_int32(ctx, (int32_t)get_time_diff_ms(&wi->last_event_tm, tm));
2002     android_log_write_int32(ctx, (int32_t)get_time_diff_ms(&wi->prev_wakeup_tm, tm));
2003     android_log_write_int32(ctx, wi->wakeups_since_event);
2004     android_log_write_int32(ctx, wi->skipped_wakeups);
2005     android_log_write_int32(ctx, (int32_t)min(swap_kb, INT32_MAX));
2006     android_log_write_int32(ctx, (int32_t)mi->field.total_gpu_kb);
2007 
2008     if (pd) {
2009         android_log_write_float32(ctx, pd->mem_stats[PSI_SOME].avg10);
2010         android_log_write_float32(ctx, pd->mem_stats[PSI_FULL].avg10);
2011         android_log_write_float32(ctx, pd->io_stats[PSI_SOME].avg10);
2012         android_log_write_float32(ctx, pd->io_stats[PSI_FULL].avg10);
2013         android_log_write_float32(ctx, pd->cpu_stats[PSI_SOME].avg10);
2014     } else {
2015         for (int i = 0; i < 5; i++) {
2016             android_log_write_float32(ctx, 0);
2017         }
2018     }
2019 
2020     android_log_write_list(ctx, LOG_ID_EVENTS);
2021     android_log_reset(ctx);
2022 }
2023 
proc_adj_lru(int oomadj)2024 static struct proc *proc_adj_lru(int oomadj) {
2025     return (struct proc *)adjslot_tail(&procadjslot_list[ADJTOSLOT(oomadj)]);
2026 }
2027 
proc_get_heaviest(int oomadj)2028 static struct proc *proc_get_heaviest(int oomadj) {
2029     struct adjslot_list *head = &procadjslot_list[ADJTOSLOT(oomadj)];
2030     struct adjslot_list *curr = head->next;
2031     struct proc *maxprocp = NULL;
2032     int maxsize = 0;
2033     while (curr != head) {
2034         int pid = ((struct proc *)curr)->pid;
2035         int tasksize = proc_get_size(pid);
2036         if (tasksize < 0) {
2037             struct adjslot_list *next = curr->next;
2038             pid_remove(pid);
2039             curr = next;
2040         } else {
2041             if (tasksize > maxsize) {
2042                 maxsize = tasksize;
2043                 maxprocp = (struct proc *)curr;
2044             }
2045             curr = curr->next;
2046         }
2047     }
2048     return maxprocp;
2049 }
2050 
set_process_group_and_prio(int pid,const std::vector<std::string> & profiles,int prio)2051 static void set_process_group_and_prio(int pid, const std::vector<std::string>& profiles,
2052                                        int prio) {
2053     DIR* d;
2054     char proc_path[PATH_MAX];
2055     struct dirent* de;
2056 
2057     snprintf(proc_path, sizeof(proc_path), "/proc/%d/task", pid);
2058     if (!(d = opendir(proc_path))) {
2059         ALOGW("Failed to open %s; errno=%d: process pid(%d) might have died", proc_path, errno,
2060               pid);
2061         return;
2062     }
2063 
2064     while ((de = readdir(d))) {
2065         int t_pid;
2066 
2067         if (de->d_name[0] == '.') continue;
2068         t_pid = atoi(de->d_name);
2069 
2070         if (!t_pid) {
2071             ALOGW("Failed to get t_pid for '%s' of pid(%d)", de->d_name, pid);
2072             continue;
2073         }
2074 
2075         if (setpriority(PRIO_PROCESS, t_pid, prio) && errno != ESRCH) {
2076             ALOGW("Unable to raise priority of killing t_pid (%d): errno=%d", t_pid, errno);
2077         }
2078 
2079         if (!SetTaskProfiles(t_pid, profiles, true)) {
2080             ALOGW("Failed to set task_profiles on pid(%d) t_pid(%d)", pid, t_pid);
2081             continue;
2082         }
2083     }
2084     closedir(d);
2085 }
2086 
is_kill_pending(void)2087 static bool is_kill_pending(void) {
2088     char buf[24];
2089 
2090     if (last_kill_pid_or_fd < 0) {
2091         return false;
2092     }
2093 
2094     if (pidfd_supported) {
2095         return true;
2096     }
2097 
2098     /* when pidfd is not supported base the decision on /proc/<pid> existence */
2099     snprintf(buf, sizeof(buf), "/proc/%d/", last_kill_pid_or_fd);
2100     if (access(buf, F_OK) == 0) {
2101         return true;
2102     }
2103 
2104     return false;
2105 }
2106 
is_waiting_for_kill(void)2107 static bool is_waiting_for_kill(void) {
2108     return pidfd_supported && last_kill_pid_or_fd >= 0;
2109 }
2110 
stop_wait_for_proc_kill(bool finished)2111 static void stop_wait_for_proc_kill(bool finished) {
2112     struct epoll_event epev;
2113 
2114     if (last_kill_pid_or_fd < 0) {
2115         return;
2116     }
2117 
2118     if (debug_process_killing) {
2119         struct timespec curr_tm;
2120 
2121         if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
2122             /*
2123              * curr_tm is used here merely to report kill duration, so this failure is not fatal.
2124              * Log an error and continue.
2125              */
2126             ALOGE("Failed to get current time");
2127         }
2128 
2129         if (finished) {
2130             ALOGI("Process got killed in %ldms",
2131                 get_time_diff_ms(&last_kill_tm, &curr_tm));
2132         } else {
2133             ALOGI("Stop waiting for process kill after %ldms",
2134                 get_time_diff_ms(&last_kill_tm, &curr_tm));
2135         }
2136     }
2137 
2138     if (pidfd_supported) {
2139         /* unregister fd */
2140         if (epoll_ctl(epollfd, EPOLL_CTL_DEL, last_kill_pid_or_fd, &epev)) {
2141             // Log an error and keep going
2142             ALOGE("epoll_ctl for last killed process failed; errno=%d", errno);
2143         }
2144         maxevents--;
2145         close(last_kill_pid_or_fd);
2146     }
2147 
2148     last_kill_pid_or_fd = -1;
2149 }
2150 
kill_done_handler(int data __unused,uint32_t events __unused,struct polling_params * poll_params)2151 static void kill_done_handler(int data __unused, uint32_t events __unused,
2152                               struct polling_params *poll_params) {
2153     stop_wait_for_proc_kill(true);
2154     poll_params->update = POLLING_RESUME;
2155 }
2156 
start_wait_for_proc_kill(int pid_or_fd)2157 static void start_wait_for_proc_kill(int pid_or_fd) {
2158     static struct event_handler_info kill_done_hinfo = { 0, kill_done_handler };
2159     struct epoll_event epev;
2160 
2161     if (last_kill_pid_or_fd >= 0) {
2162         /* Should not happen but if it does we should stop previous wait */
2163         ALOGE("Attempt to wait for a kill while another wait is in progress");
2164         stop_wait_for_proc_kill(false);
2165     }
2166 
2167     last_kill_pid_or_fd = pid_or_fd;
2168 
2169     if (!pidfd_supported) {
2170         /* If pidfd is not supported just store PID and exit */
2171         return;
2172     }
2173 
2174     epev.events = EPOLLIN;
2175     epev.data.ptr = (void *)&kill_done_hinfo;
2176     if (epoll_ctl(epollfd, EPOLL_CTL_ADD, last_kill_pid_or_fd, &epev) != 0) {
2177         ALOGE("epoll_ctl for last kill failed; errno=%d", errno);
2178         close(last_kill_pid_or_fd);
2179         last_kill_pid_or_fd = -1;
2180         return;
2181     }
2182     maxevents++;
2183 }
2184 
/* Kill decision details passed to kill_one_process() for logging and stats */
struct kill_info {
    enum kill_reasons kill_reason;  /* reason code, logged and copied into kill_stat */
    const char *kill_desc;          /* text printed after "reason:" in the kill log line */
    int thrashing;                  /* copied into kill_stat.thrashing */
    int max_thrashing;              /* copied into kill_stat.max_thrashing */
};
2191 
2192 /* Kill one process specified by procp.  Returns the size (in pages) of the process killed */
kill_one_process(struct proc * procp,int min_oom_score,struct kill_info * ki,union meminfo * mi,struct wakeup_info * wi,struct timespec * tm,struct psi_data * pd)2193 static int kill_one_process(struct proc* procp, int min_oom_score, struct kill_info *ki,
2194                             union meminfo *mi, struct wakeup_info *wi, struct timespec *tm,
2195                             struct psi_data *pd) {
2196     int pid = procp->pid;
2197     int pidfd = procp->pidfd;
2198     uid_t uid = procp->uid;
2199     char *taskname;
2200     int r;
2201     int result = -1;
2202     struct memory_stat *mem_st;
2203     struct kill_stat kill_st;
2204     int64_t tgid;
2205     int64_t rss_kb;
2206     int64_t swap_kb;
2207     char buf[PAGE_SIZE];
2208 
2209     if (!read_proc_status(pid, buf, sizeof(buf))) {
2210         goto out;
2211     }
2212     if (!parse_status_tag(buf, PROC_STATUS_TGID_FIELD, &tgid)) {
2213         ALOGE("Unable to parse tgid from /proc/%d/status", pid);
2214         goto out;
2215     }
2216     if (tgid != pid) {
2217         ALOGE("Possible pid reuse detected (pid %d, tgid %" PRId64 ")!", pid, tgid);
2218         goto out;
2219     }
2220     // Zombie processes will not have RSS / Swap fields.
2221     if (!parse_status_tag(buf, PROC_STATUS_RSS_FIELD, &rss_kb)) {
2222         goto out;
2223     }
2224     if (!parse_status_tag(buf, PROC_STATUS_SWAP_FIELD, &swap_kb)) {
2225         goto out;
2226     }
2227 
2228     taskname = proc_get_name(pid, buf, sizeof(buf));
2229     // taskname will point inside buf, do not reuse buf onwards.
2230     if (!taskname) {
2231         goto out;
2232     }
2233 
2234     mem_st = stats_read_memory_stat(per_app_memcg, pid, uid, rss_kb * 1024, swap_kb * 1024);
2235 
2236     TRACE_KILL_START(pid);
2237 
2238     /* CAP_KILL required */
2239     if (pidfd < 0) {
2240         start_wait_for_proc_kill(pid);
2241         r = kill(pid, SIGKILL);
2242     } else {
2243         start_wait_for_proc_kill(pidfd);
2244         r = pidfd_send_signal(pidfd, SIGKILL, NULL, 0);
2245     }
2246 
2247     TRACE_KILL_END();
2248 
2249     if (r) {
2250         stop_wait_for_proc_kill(false);
2251         ALOGE("kill(%d): errno=%d", pid, errno);
2252         /* Delete process record even when we fail to kill so that we don't get stuck on it */
2253         goto out;
2254     }
2255 
2256     set_process_group_and_prio(pid, {"CPUSET_SP_FOREGROUND", "SCHED_SP_FOREGROUND"},
2257                                ANDROID_PRIORITY_HIGHEST);
2258 
2259     last_kill_tm = *tm;
2260 
2261     inc_killcnt(procp->oomadj);
2262 
2263     if (ki) {
2264         kill_st.kill_reason = ki->kill_reason;
2265         kill_st.thrashing = ki->thrashing;
2266         kill_st.max_thrashing = ki->max_thrashing;
2267         killinfo_log(procp, min_oom_score, rss_kb, swap_kb, ki->kill_reason, mi, wi, tm, pd);
2268         ALOGI("Kill '%s' (%d), uid %d, oom_score_adj %d to free %" PRId64 "kB rss, %" PRId64
2269               "kB swap; reason: %s", taskname, pid, uid, procp->oomadj, rss_kb, swap_kb,
2270               ki->kill_desc);
2271     } else {
2272         kill_st.kill_reason = NONE;
2273         kill_st.thrashing = 0;
2274         kill_st.max_thrashing = 0;
2275         killinfo_log(procp, min_oom_score, rss_kb, swap_kb, NONE, mi, wi, tm, pd);
2276         ALOGI("Kill '%s' (%d), uid %d, oom_score_adj %d to free %" PRId64 "kB rss, %" PRId64
2277               "kb swap", taskname, pid, uid, procp->oomadj, rss_kb, swap_kb);
2278     }
2279 
2280     kill_st.uid = static_cast<int32_t>(uid);
2281     kill_st.taskname = taskname;
2282     kill_st.oom_score = procp->oomadj;
2283     kill_st.min_oom_score = min_oom_score;
2284     kill_st.free_mem_kb = mi->field.nr_free_pages * page_k;
2285     kill_st.free_swap_kb = mi->field.free_swap * page_k;
2286     stats_write_lmk_kill_occurred(&kill_st, mem_st);
2287 
2288     ctrl_data_write_lmk_kill_occurred((pid_t)pid, uid);
2289 
2290     result = rss_kb / page_k;
2291 
2292 out:
2293     /*
2294      * WARNING: After pid_remove() procp is freed and can't be used!
2295      * Therefore placed at the end of the function.
2296      */
2297     pid_remove(pid);
2298     return result;
2299 }
2300 
2301 /*
2302  * Find one process to kill at or above the given oom_score_adj level.
2303  * Returns size of the killed process.
2304  */
find_and_kill_process(int min_score_adj,struct kill_info * ki,union meminfo * mi,struct wakeup_info * wi,struct timespec * tm,struct psi_data * pd)2305 static int find_and_kill_process(int min_score_adj, struct kill_info *ki, union meminfo *mi,
2306                                  struct wakeup_info *wi, struct timespec *tm,
2307                                  struct psi_data *pd) {
2308     int i;
2309     int killed_size = 0;
2310     bool lmk_state_change_start = false;
2311     bool choose_heaviest_task = kill_heaviest_task;
2312 
2313     for (i = OOM_SCORE_ADJ_MAX; i >= min_score_adj; i--) {
2314         struct proc *procp;
2315 
2316         if (!choose_heaviest_task && i <= PERCEPTIBLE_APP_ADJ) {
2317             /*
2318              * If we have to choose a perceptible process, choose the heaviest one to
2319              * hopefully minimize the number of victims.
2320              */
2321             choose_heaviest_task = true;
2322         }
2323 
2324         while (true) {
2325             procp = choose_heaviest_task ?
2326                 proc_get_heaviest(i) : proc_adj_lru(i);
2327 
2328             if (!procp)
2329                 break;
2330 
2331             killed_size = kill_one_process(procp, min_score_adj, ki, mi, wi, tm, pd);
2332             if (killed_size >= 0) {
2333                 if (!lmk_state_change_start) {
2334                     lmk_state_change_start = true;
2335                     stats_write_lmk_state_changed(STATE_START);
2336                 }
2337                 break;
2338             }
2339         }
2340         if (killed_size) {
2341             break;
2342         }
2343     }
2344 
2345     if (lmk_state_change_start) {
2346         stats_write_lmk_state_changed(STATE_STOP);
2347     }
2348 
2349     return killed_size;
2350 }
2351 
get_memory_usage(struct reread_data * file_data)2352 static int64_t get_memory_usage(struct reread_data *file_data) {
2353     int64_t mem_usage;
2354     char *buf;
2355 
2356     if ((buf = reread_file(file_data)) == NULL) {
2357         return -1;
2358     }
2359 
2360     if (!parse_int64(buf, &mem_usage)) {
2361         ALOGE("%s parse error", file_data->filename);
2362         return -1;
2363     }
2364     if (mem_usage == 0) {
2365         ALOGE("No memory!");
2366         return -1;
2367     }
2368     return mem_usage;
2369 }
2370 
record_low_pressure_levels(union meminfo * mi)2371 void record_low_pressure_levels(union meminfo *mi) {
2372     if (low_pressure_mem.min_nr_free_pages == -1 ||
2373         low_pressure_mem.min_nr_free_pages > mi->field.nr_free_pages) {
2374         if (debug_process_killing) {
2375             ALOGI("Low pressure min memory update from %" PRId64 " to %" PRId64,
2376                 low_pressure_mem.min_nr_free_pages, mi->field.nr_free_pages);
2377         }
2378         low_pressure_mem.min_nr_free_pages = mi->field.nr_free_pages;
2379     }
2380     /*
2381      * Free memory at low vmpressure events occasionally gets spikes,
2382      * possibly a stale low vmpressure event with memory already
2383      * freed up (no memory pressure should have been reported).
2384      * Ignore large jumps in max_nr_free_pages that would mess up our stats.
2385      */
2386     if (low_pressure_mem.max_nr_free_pages == -1 ||
2387         (low_pressure_mem.max_nr_free_pages < mi->field.nr_free_pages &&
2388          mi->field.nr_free_pages - low_pressure_mem.max_nr_free_pages <
2389          low_pressure_mem.max_nr_free_pages * 0.1)) {
2390         if (debug_process_killing) {
2391             ALOGI("Low pressure max memory update from %" PRId64 " to %" PRId64,
2392                 low_pressure_mem.max_nr_free_pages, mi->field.nr_free_pages);
2393         }
2394         low_pressure_mem.max_nr_free_pages = mi->field.nr_free_pages;
2395     }
2396 }
2397 
upgrade_level(enum vmpressure_level level)2398 enum vmpressure_level upgrade_level(enum vmpressure_level level) {
2399     return (enum vmpressure_level)((level < VMPRESS_LEVEL_CRITICAL) ?
2400         level + 1 : level);
2401 }
2402 
downgrade_level(enum vmpressure_level level)2403 enum vmpressure_level downgrade_level(enum vmpressure_level level) {
2404     return (enum vmpressure_level)((level > VMPRESS_LEVEL_LOW) ?
2405         level - 1 : level);
2406 }
2407 
/*
 * Result of get_lowest_watermark(). Values are ordered so that a smaller value means a
 * lower watermark was breached; callers compare them with < and >.
 */
enum zone_watermark {
    WMARK_MIN = 0,  /* free pages are below the min watermark */
    WMARK_LOW,      /* free pages are below the low watermark */
    WMARK_HIGH,     /* free pages are below the high watermark */
    WMARK_NONE      /* no watermark is breached */
};
2414 
/* Watermark totals summed over all present zones by calc_zone_watermarks() */
struct zone_watermarks {
    long high_wmark;  /* sum of (max_protection + high) */
    long low_wmark;   /* sum of (max_protection + low) */
    long min_wmark;   /* sum of (max_protection + min) */
};
2420 
2421 /*
2422  * Returns lowest breached watermark or WMARK_NONE.
2423  */
get_lowest_watermark(union meminfo * mi,struct zone_watermarks * watermarks)2424 static enum zone_watermark get_lowest_watermark(union meminfo *mi,
2425                                                 struct zone_watermarks *watermarks)
2426 {
2427     int64_t nr_free_pages = mi->field.nr_free_pages - mi->field.cma_free;
2428 
2429     if (nr_free_pages < watermarks->min_wmark) {
2430         return WMARK_MIN;
2431     }
2432     if (nr_free_pages < watermarks->low_wmark) {
2433         return WMARK_LOW;
2434     }
2435     if (nr_free_pages < watermarks->high_wmark) {
2436         return WMARK_HIGH;
2437     }
2438     return WMARK_NONE;
2439 }
2440 
calc_zone_watermarks(struct zoneinfo * zi,struct zone_watermarks * watermarks)2441 void calc_zone_watermarks(struct zoneinfo *zi, struct zone_watermarks *watermarks) {
2442     memset(watermarks, 0, sizeof(struct zone_watermarks));
2443 
2444     for (int node_idx = 0; node_idx < zi->node_count; node_idx++) {
2445         struct zoneinfo_node *node = &zi->nodes[node_idx];
2446         for (int zone_idx = 0; zone_idx < node->zone_count; zone_idx++) {
2447             struct zoneinfo_zone *zone = &node->zones[zone_idx];
2448 
2449             if (!zone->fields.field.present) {
2450                 continue;
2451             }
2452 
2453             watermarks->high_wmark += zone->max_protection + zone->fields.field.high;
2454             watermarks->low_wmark += zone->max_protection + zone->fields.field.low;
2455             watermarks->min_wmark += zone->max_protection + zone->fields.field.min;
2456         }
2457     }
2458 }
2459 
calc_swap_utilization(union meminfo * mi)2460 static int calc_swap_utilization(union meminfo *mi) {
2461     int64_t swap_used = mi->field.total_swap - mi->field.free_swap;
2462     int64_t total_swappable = mi->field.active_anon + mi->field.inactive_anon +
2463                               mi->field.shmem + swap_used;
2464     return total_swappable > 0 ? (swap_used * 100) / total_swappable : 0;
2465 }
2466 
mp_event_psi(int data,uint32_t events,struct polling_params * poll_params)2467 static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_params) {
2468     enum reclaim_state {
2469         NO_RECLAIM = 0,
2470         KSWAPD_RECLAIM,
2471         DIRECT_RECLAIM,
2472     };
2473     static int64_t init_ws_refault;
2474     static int64_t prev_workingset_refault;
2475     static int64_t base_file_lru;
2476     static int64_t init_pgscan_kswapd;
2477     static int64_t init_pgscan_direct;
2478     static int64_t swap_low_threshold;
2479     static bool killing;
2480     static int thrashing_limit = thrashing_limit_pct;
2481     static struct zone_watermarks watermarks;
2482     static struct timespec wmark_update_tm;
2483     static struct wakeup_info wi;
2484     static struct timespec thrashing_reset_tm;
2485     static int64_t prev_thrash_growth = 0;
2486     static bool check_filecache = false;
2487     static int max_thrashing = 0;
2488 
2489     union meminfo mi;
2490     union vmstat vs;
2491     struct psi_data psi_data;
2492     struct timespec curr_tm;
2493     int64_t thrashing = 0;
2494     bool swap_is_low = false;
2495     enum vmpressure_level level = (enum vmpressure_level)data;
2496     enum kill_reasons kill_reason = NONE;
2497     bool cycle_after_kill = false;
2498     enum reclaim_state reclaim = NO_RECLAIM;
2499     enum zone_watermark wmark = WMARK_NONE;
2500     char kill_desc[LINE_MAX];
2501     bool cut_thrashing_limit = false;
2502     int min_score_adj = 0;
2503     int swap_util = 0;
2504     long since_thrashing_reset_ms;
2505     int64_t workingset_refault_file;
2506     bool critical_stall = false;
2507 
2508     if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
2509         ALOGE("Failed to get current time");
2510         return;
2511     }
2512 
2513     record_wakeup_time(&curr_tm, events ? Event : Polling, &wi);
2514 
2515     bool kill_pending = is_kill_pending();
2516     if (kill_pending && (kill_timeout_ms == 0 ||
2517         get_time_diff_ms(&last_kill_tm, &curr_tm) < static_cast<long>(kill_timeout_ms))) {
2518         /* Skip while still killing a process */
2519         wi.skipped_wakeups++;
2520         goto no_kill;
2521     }
2522     /*
2523      * Process is dead or kill timeout is over, stop waiting. This has no effect if pidfds are
2524      * supported and death notification already caused waiting to stop.
2525      */
2526     stop_wait_for_proc_kill(!kill_pending);
2527 
2528     if (vmstat_parse(&vs) < 0) {
2529         ALOGE("Failed to parse vmstat!");
2530         return;
2531     }
2532     /* Starting 5.9 kernel workingset_refault vmstat field was renamed workingset_refault_file */
2533     workingset_refault_file = vs.field.workingset_refault ? : vs.field.workingset_refault_file;
2534 
2535     if (meminfo_parse(&mi) < 0) {
2536         ALOGE("Failed to parse meminfo!");
2537         return;
2538     }
2539 
2540     /* Reset states after process got killed */
2541     if (killing) {
2542         killing = false;
2543         cycle_after_kill = true;
2544         /* Reset file-backed pagecache size and refault amounts after a kill */
2545         base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file;
2546         init_ws_refault = workingset_refault_file;
2547         thrashing_reset_tm = curr_tm;
2548         prev_thrash_growth = 0;
2549     }
2550 
2551     /* Check free swap levels */
2552     if (swap_free_low_percentage) {
2553         if (!swap_low_threshold) {
2554             swap_low_threshold = mi.field.total_swap * swap_free_low_percentage / 100;
2555         }
2556         swap_is_low = mi.field.free_swap < swap_low_threshold;
2557     }
2558 
2559     /* Identify reclaim state */
2560     if (vs.field.pgscan_direct > init_pgscan_direct) {
2561         init_pgscan_direct = vs.field.pgscan_direct;
2562         init_pgscan_kswapd = vs.field.pgscan_kswapd;
2563         reclaim = DIRECT_RECLAIM;
2564     } else if (vs.field.pgscan_kswapd > init_pgscan_kswapd) {
2565         init_pgscan_kswapd = vs.field.pgscan_kswapd;
2566         reclaim = KSWAPD_RECLAIM;
2567     } else if (workingset_refault_file == prev_workingset_refault) {
2568         /*
2569          * Device is not thrashing and not reclaiming, bail out early until we see these stats
2570          * changing
2571          */
2572         goto no_kill;
2573     }
2574 
2575     prev_workingset_refault = workingset_refault_file;
2576 
2577      /*
2578      * It's possible we fail to find an eligible process to kill (ex. no process is
2579      * above oom_adj_min). When this happens, we should retry to find a new process
2580      * for a kill whenever a new eligible process is available. This is especially
2581      * important for a slow growing refault case. While retrying, we should keep
2582      * monitoring new thrashing counter as someone could release the memory to mitigate
2583      * the thrashing. Thus, when thrashing reset window comes, we decay the prev thrashing
2584      * counter by window counts. If the counter is still greater than thrashing limit,
2585      * we preserve the current prev_thrash counter so we will retry kill again. Otherwise,
2586      * we reset the prev_thrash counter so we will stop retrying.
2587      */
2588     since_thrashing_reset_ms = get_time_diff_ms(&thrashing_reset_tm, &curr_tm);
2589     if (since_thrashing_reset_ms > THRASHING_RESET_INTERVAL_MS) {
2590         long windows_passed;
2591         /* Calculate prev_thrash_growth if we crossed THRASHING_RESET_INTERVAL_MS */
2592         prev_thrash_growth = (workingset_refault_file - init_ws_refault) * 100
2593                             / (base_file_lru + 1);
2594         windows_passed = (since_thrashing_reset_ms / THRASHING_RESET_INTERVAL_MS);
2595         /*
2596          * Decay prev_thrashing unless over-the-limit thrashing was registered in the window we
2597          * just crossed, which means there were no eligible processes to kill. We preserve the
2598          * counter in that case to ensure a kill if a new eligible process appears.
2599          */
2600         if (windows_passed > 1 || prev_thrash_growth < thrashing_limit) {
2601             prev_thrash_growth >>= windows_passed;
2602         }
2603 
2604         /* Record file-backed pagecache size when crossing THRASHING_RESET_INTERVAL_MS */
2605         base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file;
2606         init_ws_refault = workingset_refault_file;
2607         thrashing_reset_tm = curr_tm;
2608         thrashing_limit = thrashing_limit_pct;
2609     } else {
2610         /* Calculate what % of the file-backed pagecache refaulted so far */
2611         thrashing = (workingset_refault_file - init_ws_refault) * 100 / (base_file_lru + 1);
2612     }
2613     /* Add previous cycle's decayed thrashing amount */
2614     thrashing += prev_thrash_growth;
2615     if (max_thrashing < thrashing) {
2616         max_thrashing = thrashing;
2617     }
2618 
2619     /*
2620      * Refresh watermarks once per min in case user updated one of the margins.
2621      * TODO: b/140521024 replace this periodic update with an API for AMS to notify LMKD
2622      * that zone watermarks were changed by the system software.
2623      */
2624     if (watermarks.high_wmark == 0 || get_time_diff_ms(&wmark_update_tm, &curr_tm) > 60000) {
2625         struct zoneinfo zi;
2626 
2627         if (zoneinfo_parse(&zi) < 0) {
2628             ALOGE("Failed to parse zoneinfo!");
2629             return;
2630         }
2631 
2632         calc_zone_watermarks(&zi, &watermarks);
2633         wmark_update_tm = curr_tm;
2634     }
2635 
2636     /* Find out which watermark is breached if any */
2637     wmark = get_lowest_watermark(&mi, &watermarks);
2638 
2639     if (!psi_parse_mem(&psi_data)) {
2640         critical_stall = psi_data.mem_stats[PSI_FULL].avg10 > (float)stall_limit_critical;
2641     }
2642     /*
2643      * TODO: move this logic into a separate function
2644      * Decide if killing a process is necessary and record the reason
2645      */
2646     if (cycle_after_kill && wmark < WMARK_LOW) {
2647         /*
2648          * Prevent kills not freeing enough memory which might lead to OOM kill.
2649          * This might happen when a process is consuming memory faster than reclaim can
2650          * free even after a kill. Mostly happens when running memory stress tests.
2651          */
2652         kill_reason = PRESSURE_AFTER_KILL;
2653         strncpy(kill_desc, "min watermark is breached even after kill", sizeof(kill_desc));
2654     } else if (level == VMPRESS_LEVEL_CRITICAL && events != 0) {
2655         /*
2656          * Device is too busy reclaiming memory which might lead to ANR.
2657          * Critical level is triggered when PSI complete stall (all tasks are blocked because
2658          * of the memory congestion) breaches the configured threshold.
2659          */
2660         kill_reason = NOT_RESPONDING;
2661         strncpy(kill_desc, "device is not responding", sizeof(kill_desc));
2662     } else if (swap_is_low && thrashing > thrashing_limit_pct) {
2663         /* Page cache is thrashing while swap is low */
2664         kill_reason = LOW_SWAP_AND_THRASHING;
2665         snprintf(kill_desc, sizeof(kill_desc), "device is low on swap (%" PRId64
2666             "kB < %" PRId64 "kB) and thrashing (%" PRId64 "%%)",
2667             mi.field.free_swap * page_k, swap_low_threshold * page_k, thrashing);
2668         /* Do not kill perceptible apps unless below min watermark or heavily thrashing */
2669         if (wmark > WMARK_MIN && thrashing < thrashing_critical_pct) {
2670             min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
2671         }
2672         check_filecache = true;
2673     } else if (swap_is_low && wmark < WMARK_HIGH) {
2674         /* Both free memory and swap are low */
2675         kill_reason = LOW_MEM_AND_SWAP;
2676         snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and swap is low (%"
2677             PRId64 "kB < %" PRId64 "kB)", wmark < WMARK_LOW ? "min" : "low",
2678             mi.field.free_swap * page_k, swap_low_threshold * page_k);
2679         /* Do not kill perceptible apps unless below min watermark or heavily thrashing */
2680         if (wmark > WMARK_MIN && thrashing < thrashing_critical_pct) {
2681             min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
2682         }
2683     } else if (wmark < WMARK_HIGH && swap_util_max < 100 &&
2684                (swap_util = calc_swap_utilization(&mi)) > swap_util_max) {
2685         /*
2686          * Too much anon memory is swapped out but swap is not low.
2687          * Non-swappable allocations created memory pressure.
2688          */
2689         kill_reason = LOW_MEM_AND_SWAP_UTIL;
2690         snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and swap utilization"
2691             " is high (%d%% > %d%%)", wmark < WMARK_LOW ? "min" : "low",
2692             swap_util, swap_util_max);
2693     } else if (wmark < WMARK_HIGH && thrashing > thrashing_limit) {
2694         /* Page cache is thrashing while memory is low */
2695         kill_reason = LOW_MEM_AND_THRASHING;
2696         snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and thrashing (%"
2697             PRId64 "%%)", wmark < WMARK_LOW ? "min" : "low", thrashing);
2698         cut_thrashing_limit = true;
2699         /* Do not kill perceptible apps unless thrashing at critical levels */
2700         if (thrashing < thrashing_critical_pct) {
2701             min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
2702         }
2703         check_filecache = true;
2704     } else if (reclaim == DIRECT_RECLAIM && thrashing > thrashing_limit) {
2705         /* Page cache is thrashing while in direct reclaim (mostly happens on lowram devices) */
2706         kill_reason = DIRECT_RECL_AND_THRASHING;
2707         snprintf(kill_desc, sizeof(kill_desc), "device is in direct reclaim and thrashing (%"
2708             PRId64 "%%)", thrashing);
2709         cut_thrashing_limit = true;
2710         /* Do not kill perceptible apps unless thrashing at critical levels */
2711         if (thrashing < thrashing_critical_pct) {
2712             min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
2713         }
2714         check_filecache = true;
2715     } else if (check_filecache) {
2716         int64_t file_lru_kb = (vs.field.nr_inactive_file + vs.field.nr_active_file) * page_k;
2717 
2718         if (file_lru_kb < filecache_min_kb) {
2719             /* File cache is too low after thrashing, keep killing background processes */
2720             kill_reason = LOW_FILECACHE_AFTER_THRASHING;
2721             snprintf(kill_desc, sizeof(kill_desc),
2722                 "filecache is low (%" PRId64 "kB < %" PRId64 "kB) after thrashing",
2723                 file_lru_kb, filecache_min_kb);
2724             min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
2725         } else {
2726             /* File cache is big enough, stop checking */
2727             check_filecache = false;
2728         }
2729     }
2730 
2731     /* Kill a process if necessary */
2732     if (kill_reason != NONE) {
2733         struct kill_info ki = {
2734             .kill_reason = kill_reason,
2735             .kill_desc = kill_desc,
2736             .thrashing = (int)thrashing,
2737             .max_thrashing = max_thrashing,
2738         };
2739 
2740         /* Allow killing perceptible apps if the system is stalled */
2741         if (critical_stall) {
2742             min_score_adj = 0;
2743         }
2744         psi_parse_io(&psi_data);
2745         psi_parse_cpu(&psi_data);
2746         int pages_freed = find_and_kill_process(min_score_adj, &ki, &mi, &wi, &curr_tm, &psi_data);
2747         if (pages_freed > 0) {
2748             killing = true;
2749             max_thrashing = 0;
2750             if (cut_thrashing_limit) {
2751                 /*
2752                  * Cut thrasing limit by thrashing_limit_decay_pct percentage of the current
2753                  * thrashing limit until the system stops thrashing.
2754                  */
2755                 thrashing_limit = (thrashing_limit * (100 - thrashing_limit_decay_pct)) / 100;
2756             }
2757         }
2758     }
2759 
2760 no_kill:
2761     /* Do not poll if kernel supports pidfd waiting */
2762     if (is_waiting_for_kill()) {
2763         /* Pause polling if we are waiting for process death notification */
2764         poll_params->update = POLLING_PAUSE;
2765         return;
2766     }
2767 
2768     /*
2769      * Start polling after initial PSI event;
2770      * extend polling while device is in direct reclaim or process is being killed;
2771      * do not extend when kswapd reclaims because that might go on for a long time
2772      * without causing memory pressure
2773      */
2774     if (events || killing || reclaim == DIRECT_RECLAIM) {
2775         poll_params->update = POLLING_START;
2776     }
2777 
2778     /* Decide the polling interval */
2779     if (swap_is_low || killing) {
2780         /* Fast polling during and after a kill or when swap is low */
2781         poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
2782     } else {
2783         /* By default use long intervals */
2784         poll_params->polling_interval_ms = PSI_POLL_PERIOD_LONG_MS;
2785     }
2786 }
2787 
mp_event_common(int data,uint32_t events,struct polling_params * poll_params)2788 static void mp_event_common(int data, uint32_t events, struct polling_params *poll_params) {
2789     unsigned long long evcount;
2790     int64_t mem_usage, memsw_usage;
2791     int64_t mem_pressure;
2792     union meminfo mi;
2793     struct zoneinfo zi;
2794     struct timespec curr_tm;
2795     static unsigned long kill_skip_count = 0;
2796     enum vmpressure_level level = (enum vmpressure_level)data;
2797     long other_free = 0, other_file = 0;
2798     int min_score_adj;
2799     int minfree = 0;
2800     static struct reread_data mem_usage_file_data = {
2801         .filename = MEMCG_MEMORY_USAGE,
2802         .fd = -1,
2803     };
2804     static struct reread_data memsw_usage_file_data = {
2805         .filename = MEMCG_MEMORYSW_USAGE,
2806         .fd = -1,
2807     };
2808     static struct wakeup_info wi;
2809 
2810     if (debug_process_killing) {
2811         ALOGI("%s memory pressure event is triggered", level_name[level]);
2812     }
2813 
2814     if (!use_psi_monitors) {
2815         /*
2816          * Check all event counters from low to critical
2817          * and upgrade to the highest priority one. By reading
2818          * eventfd we also reset the event counters.
2819          */
2820         for (int lvl = VMPRESS_LEVEL_LOW; lvl < VMPRESS_LEVEL_COUNT; lvl++) {
2821             if (mpevfd[lvl] != -1 &&
2822                 TEMP_FAILURE_RETRY(read(mpevfd[lvl],
2823                                    &evcount, sizeof(evcount))) > 0 &&
2824                 evcount > 0 && lvl > level) {
2825                 level = static_cast<vmpressure_level>(lvl);
2826             }
2827         }
2828     }
2829 
2830     /* Start polling after initial PSI event */
2831     if (use_psi_monitors && events) {
2832         /* Override polling params only if current event is more critical */
2833         if (!poll_params->poll_handler || data > poll_params->poll_handler->data) {
2834             poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
2835             poll_params->update = POLLING_START;
2836         }
2837     }
2838 
2839     if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
2840         ALOGE("Failed to get current time");
2841         return;
2842     }
2843 
2844     record_wakeup_time(&curr_tm, events ? Event : Polling, &wi);
2845 
2846     if (kill_timeout_ms &&
2847         get_time_diff_ms(&last_kill_tm, &curr_tm) < static_cast<long>(kill_timeout_ms)) {
2848         /*
2849          * If we're within the no-kill timeout, see if there's pending reclaim work
2850          * from the last killed process. If so, skip killing for now.
2851          */
2852         if (is_kill_pending()) {
2853             kill_skip_count++;
2854             wi.skipped_wakeups++;
2855             return;
2856         }
2857         /*
2858          * Process is dead, stop waiting. This has no effect if pidfds are supported and
2859          * death notification already caused waiting to stop.
2860          */
2861         stop_wait_for_proc_kill(true);
2862     } else {
2863         /*
2864          * Killing took longer than no-kill timeout. Stop waiting for the last process
2865          * to die because we are ready to kill again.
2866          */
2867         stop_wait_for_proc_kill(false);
2868     }
2869 
2870     if (kill_skip_count > 0) {
2871         ALOGI("%lu memory pressure events were skipped after a kill!",
2872               kill_skip_count);
2873         kill_skip_count = 0;
2874     }
2875 
2876     if (meminfo_parse(&mi) < 0 || zoneinfo_parse(&zi) < 0) {
2877         ALOGE("Failed to get free memory!");
2878         return;
2879     }
2880 
2881     if (use_minfree_levels) {
2882         int i;
2883 
2884         other_free = mi.field.nr_free_pages - zi.totalreserve_pages;
2885         if (mi.field.nr_file_pages > (mi.field.shmem + mi.field.unevictable + mi.field.swap_cached)) {
2886             other_file = (mi.field.nr_file_pages - mi.field.shmem -
2887                           mi.field.unevictable - mi.field.swap_cached);
2888         } else {
2889             other_file = 0;
2890         }
2891 
2892         min_score_adj = OOM_SCORE_ADJ_MAX + 1;
2893         for (i = 0; i < lowmem_targets_size; i++) {
2894             minfree = lowmem_minfree[i];
2895             if (other_free < minfree && other_file < minfree) {
2896                 min_score_adj = lowmem_adj[i];
2897                 break;
2898             }
2899         }
2900 
2901         if (min_score_adj == OOM_SCORE_ADJ_MAX + 1) {
2902             if (debug_process_killing) {
2903                 ALOGI("Ignore %s memory pressure event "
2904                       "(free memory=%ldkB, cache=%ldkB, limit=%ldkB)",
2905                       level_name[level], other_free * page_k, other_file * page_k,
2906                       (long)lowmem_minfree[lowmem_targets_size - 1] * page_k);
2907             }
2908             return;
2909         }
2910 
2911         goto do_kill;
2912     }
2913 
2914     if (level == VMPRESS_LEVEL_LOW) {
2915         record_low_pressure_levels(&mi);
2916     }
2917 
2918     if (level_oomadj[level] > OOM_SCORE_ADJ_MAX) {
2919         /* Do not monitor this pressure level */
2920         return;
2921     }
2922 
2923     if ((mem_usage = get_memory_usage(&mem_usage_file_data)) < 0) {
2924         goto do_kill;
2925     }
2926     if ((memsw_usage = get_memory_usage(&memsw_usage_file_data)) < 0) {
2927         goto do_kill;
2928     }
2929 
2930     // Calculate percent for swappinness.
2931     mem_pressure = (mem_usage * 100) / memsw_usage;
2932 
2933     if (enable_pressure_upgrade && level != VMPRESS_LEVEL_CRITICAL) {
2934         // We are swapping too much.
2935         if (mem_pressure < upgrade_pressure) {
2936             level = upgrade_level(level);
2937             if (debug_process_killing) {
2938                 ALOGI("Event upgraded to %s", level_name[level]);
2939             }
2940         }
2941     }
2942 
2943     // If we still have enough swap space available, check if we want to
2944     // ignore/downgrade pressure events.
2945     if (mi.field.free_swap >=
2946         mi.field.total_swap * swap_free_low_percentage / 100) {
2947         // If the pressure is larger than downgrade_pressure lmk will not
2948         // kill any process, since enough memory is available.
2949         if (mem_pressure > downgrade_pressure) {
2950             if (debug_process_killing) {
2951                 ALOGI("Ignore %s memory pressure", level_name[level]);
2952             }
2953             return;
2954         } else if (level == VMPRESS_LEVEL_CRITICAL && mem_pressure > upgrade_pressure) {
2955             if (debug_process_killing) {
2956                 ALOGI("Downgrade critical memory pressure");
2957             }
2958             // Downgrade event, since enough memory available.
2959             level = downgrade_level(level);
2960         }
2961     }
2962 
2963 do_kill:
2964     if (low_ram_device) {
2965         /* For Go devices kill only one task */
2966         if (find_and_kill_process(level_oomadj[level], NULL, &mi, &wi, &curr_tm, NULL) == 0) {
2967             if (debug_process_killing) {
2968                 ALOGI("Nothing to kill");
2969             }
2970         }
2971     } else {
2972         int pages_freed;
2973         static struct timespec last_report_tm;
2974         static unsigned long report_skip_count = 0;
2975 
2976         if (!use_minfree_levels) {
2977             /* Free up enough memory to downgrate the memory pressure to low level */
2978             if (mi.field.nr_free_pages >= low_pressure_mem.max_nr_free_pages) {
2979                 if (debug_process_killing) {
2980                     ALOGI("Ignoring pressure since more memory is "
2981                         "available (%" PRId64 ") than watermark (%" PRId64 ")",
2982                         mi.field.nr_free_pages, low_pressure_mem.max_nr_free_pages);
2983                 }
2984                 return;
2985             }
2986             min_score_adj = level_oomadj[level];
2987         }
2988 
2989         pages_freed = find_and_kill_process(min_score_adj, NULL, &mi, &wi, &curr_tm, NULL);
2990 
2991         if (pages_freed == 0) {
2992             /* Rate limit kill reports when nothing was reclaimed */
2993             if (get_time_diff_ms(&last_report_tm, &curr_tm) < FAIL_REPORT_RLIMIT_MS) {
2994                 report_skip_count++;
2995                 return;
2996             }
2997         }
2998 
2999         /* Log whenever we kill or when report rate limit allows */
3000         if (use_minfree_levels) {
3001             ALOGI("Reclaimed %ldkB, cache(%ldkB) and free(%" PRId64 "kB)-reserved(%" PRId64 "kB) "
3002                 "below min(%ldkB) for oom_score_adj %d",
3003                 pages_freed * page_k,
3004                 other_file * page_k, mi.field.nr_free_pages * page_k,
3005                 zi.totalreserve_pages * page_k,
3006                 minfree * page_k, min_score_adj);
3007         } else {
3008             ALOGI("Reclaimed %ldkB at oom_score_adj %d", pages_freed * page_k, min_score_adj);
3009         }
3010 
3011         if (report_skip_count > 0) {
3012             ALOGI("Suppressed %lu failed kill reports", report_skip_count);
3013             report_skip_count = 0;
3014         }
3015 
3016         last_report_tm = curr_tm;
3017     }
3018     if (is_waiting_for_kill()) {
3019         /* pause polling if we are waiting for process death notification */
3020         poll_params->update = POLLING_PAUSE;
3021     }
3022 }
3023 
init_mp_psi(enum vmpressure_level level,bool use_new_strategy)3024 static bool init_mp_psi(enum vmpressure_level level, bool use_new_strategy) {
3025     int fd;
3026 
3027     /* Do not register a handler if threshold_ms is not set */
3028     if (!psi_thresholds[level].threshold_ms) {
3029         return true;
3030     }
3031 
3032     fd = init_psi_monitor(psi_thresholds[level].stall_type,
3033         psi_thresholds[level].threshold_ms * US_PER_MS,
3034         PSI_WINDOW_SIZE_MS * US_PER_MS);
3035 
3036     if (fd < 0) {
3037         return false;
3038     }
3039 
3040     vmpressure_hinfo[level].handler = use_new_strategy ? mp_event_psi : mp_event_common;
3041     vmpressure_hinfo[level].data = level;
3042     if (register_psi_monitor(epollfd, fd, &vmpressure_hinfo[level]) < 0) {
3043         destroy_psi_monitor(fd);
3044         return false;
3045     }
3046     maxevents++;
3047     mpevfd[level] = fd;
3048 
3049     return true;
3050 }
3051 
destroy_mp_psi(enum vmpressure_level level)3052 static void destroy_mp_psi(enum vmpressure_level level) {
3053     int fd = mpevfd[level];
3054 
3055     if (fd < 0) {
3056         return;
3057     }
3058 
3059     if (unregister_psi_monitor(epollfd, fd) < 0) {
3060         ALOGE("Failed to unregister psi monitor for %s memory pressure; errno=%d",
3061             level_name[level], errno);
3062     }
3063     maxevents--;
3064     destroy_psi_monitor(fd);
3065     mpevfd[level] = -1;
3066 }
3067 
init_psi_monitors()3068 static bool init_psi_monitors() {
3069     /*
3070      * When PSI is used on low-ram devices or on high-end devices without memfree levels
3071      * use new kill strategy based on zone watermarks, free swap and thrashing stats
3072      */
3073     bool use_new_strategy =
3074         GET_LMK_PROPERTY(bool, "use_new_strategy", low_ram_device || !use_minfree_levels);
3075 
3076     /* In default PSI mode override stall amounts using system properties */
3077     if (use_new_strategy) {
3078         /* Do not use low pressure level */
3079         psi_thresholds[VMPRESS_LEVEL_LOW].threshold_ms = 0;
3080         psi_thresholds[VMPRESS_LEVEL_MEDIUM].threshold_ms = psi_partial_stall_ms;
3081         psi_thresholds[VMPRESS_LEVEL_CRITICAL].threshold_ms = psi_complete_stall_ms;
3082     }
3083 
3084     if (!init_mp_psi(VMPRESS_LEVEL_LOW, use_new_strategy)) {
3085         return false;
3086     }
3087     if (!init_mp_psi(VMPRESS_LEVEL_MEDIUM, use_new_strategy)) {
3088         destroy_mp_psi(VMPRESS_LEVEL_LOW);
3089         return false;
3090     }
3091     if (!init_mp_psi(VMPRESS_LEVEL_CRITICAL, use_new_strategy)) {
3092         destroy_mp_psi(VMPRESS_LEVEL_MEDIUM);
3093         destroy_mp_psi(VMPRESS_LEVEL_LOW);
3094         return false;
3095     }
3096     return true;
3097 }
3098 
init_mp_common(enum vmpressure_level level)3099 static bool init_mp_common(enum vmpressure_level level) {
3100     int mpfd;
3101     int evfd;
3102     int evctlfd;
3103     char buf[256];
3104     struct epoll_event epev;
3105     int ret;
3106     int level_idx = (int)level;
3107     const char *levelstr = level_name[level_idx];
3108 
3109     /* gid containing AID_SYSTEM required */
3110     mpfd = open(MEMCG_SYSFS_PATH "memory.pressure_level", O_RDONLY | O_CLOEXEC);
3111     if (mpfd < 0) {
3112         ALOGI("No kernel memory.pressure_level support (errno=%d)", errno);
3113         goto err_open_mpfd;
3114     }
3115 
3116     evctlfd = open(MEMCG_SYSFS_PATH "cgroup.event_control", O_WRONLY | O_CLOEXEC);
3117     if (evctlfd < 0) {
3118         ALOGI("No kernel memory cgroup event control (errno=%d)", errno);
3119         goto err_open_evctlfd;
3120     }
3121 
3122     evfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
3123     if (evfd < 0) {
3124         ALOGE("eventfd failed for level %s; errno=%d", levelstr, errno);
3125         goto err_eventfd;
3126     }
3127 
3128     ret = snprintf(buf, sizeof(buf), "%d %d %s", evfd, mpfd, levelstr);
3129     if (ret >= (ssize_t)sizeof(buf)) {
3130         ALOGE("cgroup.event_control line overflow for level %s", levelstr);
3131         goto err;
3132     }
3133 
3134     ret = TEMP_FAILURE_RETRY(write(evctlfd, buf, strlen(buf) + 1));
3135     if (ret == -1) {
3136         ALOGE("cgroup.event_control write failed for level %s; errno=%d",
3137               levelstr, errno);
3138         goto err;
3139     }
3140 
3141     epev.events = EPOLLIN;
3142     /* use data to store event level */
3143     vmpressure_hinfo[level_idx].data = level_idx;
3144     vmpressure_hinfo[level_idx].handler = mp_event_common;
3145     epev.data.ptr = (void *)&vmpressure_hinfo[level_idx];
3146     ret = epoll_ctl(epollfd, EPOLL_CTL_ADD, evfd, &epev);
3147     if (ret == -1) {
3148         ALOGE("epoll_ctl for level %s failed; errno=%d", levelstr, errno);
3149         goto err;
3150     }
3151     maxevents++;
3152     mpevfd[level] = evfd;
3153     close(evctlfd);
3154     return true;
3155 
3156 err:
3157     close(evfd);
3158 err_eventfd:
3159     close(evctlfd);
3160 err_open_evctlfd:
3161     close(mpfd);
3162 err_open_mpfd:
3163     return false;
3164 }
3165 
destroy_mp_common(enum vmpressure_level level)3166 static void destroy_mp_common(enum vmpressure_level level) {
3167     struct epoll_event epev;
3168     int fd = mpevfd[level];
3169 
3170     if (fd < 0) {
3171         return;
3172     }
3173 
3174     if (epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, &epev)) {
3175         // Log an error and keep going
3176         ALOGE("epoll_ctl for level %s failed; errno=%d", level_name[level], errno);
3177     }
3178     maxevents--;
3179     close(fd);
3180     mpevfd[level] = -1;
3181 }
3182 
kernel_event_handler(int data __unused,uint32_t events __unused,struct polling_params * poll_params __unused)3183 static void kernel_event_handler(int data __unused, uint32_t events __unused,
3184                                  struct polling_params *poll_params __unused) {
3185     poll_kernel(kpoll_fd);
3186 }
3187 
init_monitors()3188 static bool init_monitors() {
3189     /* Try to use psi monitor first if kernel has it */
3190     use_psi_monitors = GET_LMK_PROPERTY(bool, "use_psi", true) &&
3191         init_psi_monitors();
3192     /* Fall back to vmpressure */
3193     if (!use_psi_monitors &&
3194         (!init_mp_common(VMPRESS_LEVEL_LOW) ||
3195         !init_mp_common(VMPRESS_LEVEL_MEDIUM) ||
3196         !init_mp_common(VMPRESS_LEVEL_CRITICAL))) {
3197         ALOGE("Kernel does not support memory pressure events or in-kernel low memory killer");
3198         return false;
3199     }
3200     if (use_psi_monitors) {
3201         ALOGI("Using psi monitors for memory pressure detection");
3202     } else {
3203         ALOGI("Using vmpressure for memory pressure detection");
3204     }
3205     return true;
3206 }
3207 
destroy_monitors()3208 static void destroy_monitors() {
3209     if (use_psi_monitors) {
3210         destroy_mp_psi(VMPRESS_LEVEL_CRITICAL);
3211         destroy_mp_psi(VMPRESS_LEVEL_MEDIUM);
3212         destroy_mp_psi(VMPRESS_LEVEL_LOW);
3213     } else {
3214         destroy_mp_common(VMPRESS_LEVEL_CRITICAL);
3215         destroy_mp_common(VMPRESS_LEVEL_MEDIUM);
3216         destroy_mp_common(VMPRESS_LEVEL_LOW);
3217     }
3218 }
3219 
init(void)3220 static int init(void) {
3221     static struct event_handler_info kernel_poll_hinfo = { 0, kernel_event_handler };
3222     struct reread_data file_data = {
3223         .filename = ZONEINFO_PATH,
3224         .fd = -1,
3225     };
3226     struct epoll_event epev;
3227     int pidfd;
3228     int i;
3229     int ret;
3230 
3231     page_k = sysconf(_SC_PAGESIZE);
3232     if (page_k == -1)
3233         page_k = PAGE_SIZE;
3234     page_k /= 1024;
3235 
3236     epollfd = epoll_create(MAX_EPOLL_EVENTS);
3237     if (epollfd == -1) {
3238         ALOGE("epoll_create failed (errno=%d)", errno);
3239         return -1;
3240     }
3241 
3242     // mark data connections as not connected
3243     for (int i = 0; i < MAX_DATA_CONN; i++) {
3244         data_sock[i].sock = -1;
3245     }
3246 
3247     ctrl_sock.sock = android_get_control_socket("lmkd");
3248     if (ctrl_sock.sock < 0) {
3249         ALOGE("get lmkd control socket failed");
3250         return -1;
3251     }
3252 
3253     ret = listen(ctrl_sock.sock, MAX_DATA_CONN);
3254     if (ret < 0) {
3255         ALOGE("lmkd control socket listen failed (errno=%d)", errno);
3256         return -1;
3257     }
3258 
3259     epev.events = EPOLLIN;
3260     ctrl_sock.handler_info.handler = ctrl_connect_handler;
3261     epev.data.ptr = (void *)&(ctrl_sock.handler_info);
3262     if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ctrl_sock.sock, &epev) == -1) {
3263         ALOGE("epoll_ctl for lmkd control socket failed (errno=%d)", errno);
3264         return -1;
3265     }
3266     maxevents++;
3267 
3268     has_inkernel_module = !access(INKERNEL_MINFREE_PATH, W_OK);
3269     use_inkernel_interface = has_inkernel_module;
3270 
3271     if (use_inkernel_interface) {
3272         ALOGI("Using in-kernel low memory killer interface");
3273         if (init_poll_kernel()) {
3274             epev.events = EPOLLIN;
3275             epev.data.ptr = (void*)&kernel_poll_hinfo;
3276             if (epoll_ctl(epollfd, EPOLL_CTL_ADD, kpoll_fd, &epev) != 0) {
3277                 ALOGE("epoll_ctl for lmk events failed (errno=%d)", errno);
3278                 close(kpoll_fd);
3279                 kpoll_fd = -1;
3280             } else {
3281                 maxevents++;
3282                 /* let the others know it does support reporting kills */
3283                 property_set("sys.lmk.reportkills", "1");
3284             }
3285         }
3286     } else {
3287         if (!init_monitors()) {
3288             return -1;
3289         }
3290         /* let the others know it does support reporting kills */
3291         property_set("sys.lmk.reportkills", "1");
3292     }
3293 
3294     for (i = 0; i <= ADJTOSLOT(OOM_SCORE_ADJ_MAX); i++) {
3295         procadjslot_list[i].next = &procadjslot_list[i];
3296         procadjslot_list[i].prev = &procadjslot_list[i];
3297     }
3298 
3299     memset(killcnt_idx, KILLCNT_INVALID_IDX, sizeof(killcnt_idx));
3300 
3301     /*
3302      * Read zoneinfo as the biggest file we read to create and size the initial
3303      * read buffer and avoid memory re-allocations during memory pressure
3304      */
3305     if (reread_file(&file_data) == NULL) {
3306         ALOGE("Failed to read %s: %s", file_data.filename, strerror(errno));
3307     }
3308 
3309     /* check if kernel supports pidfd_open syscall */
3310     pidfd = TEMP_FAILURE_RETRY(pidfd_open(getpid(), 0));
3311     if (pidfd < 0) {
3312         pidfd_supported = (errno != ENOSYS);
3313     } else {
3314         pidfd_supported = true;
3315         close(pidfd);
3316     }
3317     ALOGI("Process polling is %s", pidfd_supported ? "supported" : "not supported" );
3318 
3319     return 0;
3320 }
3321 
polling_paused(struct polling_params * poll_params)3322 static bool polling_paused(struct polling_params *poll_params) {
3323     return poll_params->paused_handler != NULL;
3324 }
3325 
resume_polling(struct polling_params * poll_params,struct timespec curr_tm)3326 static void resume_polling(struct polling_params *poll_params, struct timespec curr_tm) {
3327     poll_params->poll_start_tm = curr_tm;
3328     poll_params->poll_handler = poll_params->paused_handler;
3329     poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
3330     poll_params->paused_handler = NULL;
3331 }
3332 
call_handler(struct event_handler_info * handler_info,struct polling_params * poll_params,uint32_t events)3333 static void call_handler(struct event_handler_info* handler_info,
3334                          struct polling_params *poll_params, uint32_t events) {
3335     struct timespec curr_tm;
3336 
3337     poll_params->update = POLLING_DO_NOT_CHANGE;
3338     handler_info->handler(handler_info->data, events, poll_params);
3339     clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
3340     if (poll_params->poll_handler == handler_info) {
3341         poll_params->last_poll_tm = curr_tm;
3342     }
3343 
3344     switch (poll_params->update) {
3345     case POLLING_START:
3346         /*
3347          * Poll for the duration of PSI_WINDOW_SIZE_MS after the
3348          * initial PSI event because psi events are rate-limited
3349          * at one per sec.
3350          */
3351         poll_params->poll_start_tm = curr_tm;
3352         poll_params->poll_handler = handler_info;
3353         break;
3354     case POLLING_PAUSE:
3355         poll_params->paused_handler = handler_info;
3356         poll_params->poll_handler = NULL;
3357         break;
3358     case POLLING_RESUME:
3359         resume_polling(poll_params, curr_tm);
3360         break;
3361     case POLLING_DO_NOT_CHANGE:
3362         if (get_time_diff_ms(&poll_params->poll_start_tm, &curr_tm) > PSI_WINDOW_SIZE_MS) {
3363             /* Polled for the duration of PSI window, time to stop */
3364             poll_params->poll_handler = NULL;
3365         }
3366         break;
3367     }
3368 }
3369 
mainloop(void)3370 static void mainloop(void) {
3371     struct event_handler_info* handler_info;
3372     struct polling_params poll_params;
3373     struct timespec curr_tm;
3374     struct epoll_event *evt;
3375     long delay = -1;
3376 
3377     poll_params.poll_handler = NULL;
3378     poll_params.paused_handler = NULL;
3379 
3380     while (1) {
3381         struct epoll_event events[MAX_EPOLL_EVENTS];
3382         int nevents;
3383         int i;
3384 
3385         if (poll_params.poll_handler) {
3386             bool poll_now;
3387 
3388             clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
3389             if (poll_params.update == POLLING_RESUME) {
3390                 /* Just transitioned into POLLING_RESUME, poll immediately. */
3391                 poll_now = true;
3392                 nevents = 0;
3393             } else {
3394                 /* Calculate next timeout */
3395                 delay = get_time_diff_ms(&poll_params.last_poll_tm, &curr_tm);
3396                 delay = (delay < poll_params.polling_interval_ms) ?
3397                     poll_params.polling_interval_ms - delay : poll_params.polling_interval_ms;
3398 
3399                 /* Wait for events until the next polling timeout */
3400                 nevents = epoll_wait(epollfd, events, maxevents, delay);
3401 
3402                 /* Update current time after wait */
3403                 clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
3404                 poll_now = (get_time_diff_ms(&poll_params.last_poll_tm, &curr_tm) >=
3405                     poll_params.polling_interval_ms);
3406             }
3407             if (poll_now) {
3408                 call_handler(poll_params.poll_handler, &poll_params, 0);
3409             }
3410         } else {
3411             if (kill_timeout_ms && is_waiting_for_kill()) {
3412                 clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
3413                 delay = kill_timeout_ms - get_time_diff_ms(&last_kill_tm, &curr_tm);
3414                 /* Wait for pidfds notification or kill timeout to expire */
3415                 nevents = (delay > 0) ? epoll_wait(epollfd, events, maxevents, delay) : 0;
3416                 if (nevents == 0) {
3417                     /* Kill notification timed out */
3418                     stop_wait_for_proc_kill(false);
3419                     if (polling_paused(&poll_params)) {
3420                         clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
3421                         poll_params.update = POLLING_RESUME;
3422                         resume_polling(&poll_params, curr_tm);
3423                     }
3424                 }
3425             } else {
3426                 /* Wait for events with no timeout */
3427                 nevents = epoll_wait(epollfd, events, maxevents, -1);
3428             }
3429         }
3430 
3431         if (nevents == -1) {
3432             if (errno == EINTR)
3433                 continue;
3434             ALOGE("epoll_wait failed (errno=%d)", errno);
3435             continue;
3436         }
3437 
3438         /*
3439          * First pass to see if any data socket connections were dropped.
3440          * Dropped connection should be handled before any other events
3441          * to deallocate data connection and correctly handle cases when
3442          * connection gets dropped and reestablished in the same epoll cycle.
3443          * In such cases it's essential to handle connection closures first.
3444          */
3445         for (i = 0, evt = &events[0]; i < nevents; ++i, evt++) {
3446             if ((evt->events & EPOLLHUP) && evt->data.ptr) {
3447                 ALOGI("lmkd data connection dropped");
3448                 handler_info = (struct event_handler_info*)evt->data.ptr;
3449                 ctrl_data_close(handler_info->data);
3450             }
3451         }
3452 
3453         /* Second pass to handle all other events */
3454         for (i = 0, evt = &events[0]; i < nevents; ++i, evt++) {
3455             if (evt->events & EPOLLERR) {
3456                 ALOGD("EPOLLERR on event #%d", i);
3457             }
3458             if (evt->events & EPOLLHUP) {
3459                 /* This case was handled in the first pass */
3460                 continue;
3461             }
3462             if (evt->data.ptr) {
3463                 handler_info = (struct event_handler_info*)evt->data.ptr;
3464                 call_handler(handler_info, &poll_params, evt->events);
3465             }
3466         }
3467     }
3468 }
3469 
issue_reinit()3470 int issue_reinit() {
3471     int sock;
3472 
3473     sock = lmkd_connect();
3474     if (sock < 0) {
3475         ALOGE("failed to connect to lmkd: %s", strerror(errno));
3476         return -1;
3477     }
3478 
3479     enum update_props_result res = lmkd_update_props(sock);
3480     switch (res) {
3481     case UPDATE_PROPS_SUCCESS:
3482         ALOGI("lmkd updated properties successfully");
3483         break;
3484     case UPDATE_PROPS_SEND_ERR:
3485         ALOGE("failed to send lmkd request: %s", strerror(errno));
3486         break;
3487     case UPDATE_PROPS_RECV_ERR:
3488         ALOGE("failed to receive lmkd reply: %s", strerror(errno));
3489         break;
3490     case UPDATE_PROPS_FORMAT_ERR:
3491         ALOGE("lmkd reply is invalid");
3492         break;
3493     case UPDATE_PROPS_FAIL:
3494         ALOGE("lmkd failed to update its properties");
3495         break;
3496     }
3497 
3498     close(sock);
3499     return res == UPDATE_PROPS_SUCCESS ? 0 : -1;
3500 }
3501 
update_props()3502 static void update_props() {
3503     /* By default disable low level vmpressure events */
3504     level_oomadj[VMPRESS_LEVEL_LOW] =
3505         GET_LMK_PROPERTY(int32, "low", OOM_SCORE_ADJ_MAX + 1);
3506     level_oomadj[VMPRESS_LEVEL_MEDIUM] =
3507         GET_LMK_PROPERTY(int32, "medium", 800);
3508     level_oomadj[VMPRESS_LEVEL_CRITICAL] =
3509         GET_LMK_PROPERTY(int32, "critical", 0);
3510     debug_process_killing = GET_LMK_PROPERTY(bool, "debug", false);
3511 
3512     /* By default disable upgrade/downgrade logic */
3513     enable_pressure_upgrade =
3514         GET_LMK_PROPERTY(bool, "critical_upgrade", false);
3515     upgrade_pressure =
3516         (int64_t)GET_LMK_PROPERTY(int32, "upgrade_pressure", 100);
3517     downgrade_pressure =
3518         (int64_t)GET_LMK_PROPERTY(int32, "downgrade_pressure", 100);
3519     kill_heaviest_task =
3520         GET_LMK_PROPERTY(bool, "kill_heaviest_task", false);
3521     low_ram_device = property_get_bool("ro.config.low_ram", false);
3522     kill_timeout_ms =
3523         (unsigned long)GET_LMK_PROPERTY(int32, "kill_timeout_ms", 100);
3524     use_minfree_levels =
3525         GET_LMK_PROPERTY(bool, "use_minfree_levels", false);
3526     per_app_memcg =
3527         property_get_bool("ro.config.per_app_memcg", low_ram_device);
3528     swap_free_low_percentage = clamp(0, 100, GET_LMK_PROPERTY(int32, "swap_free_low_percentage",
3529         DEF_LOW_SWAP));
3530     psi_partial_stall_ms = GET_LMK_PROPERTY(int32, "psi_partial_stall_ms",
3531         low_ram_device ? DEF_PARTIAL_STALL_LOWRAM : DEF_PARTIAL_STALL);
3532     psi_complete_stall_ms = GET_LMK_PROPERTY(int32, "psi_complete_stall_ms",
3533         DEF_COMPLETE_STALL);
3534     thrashing_limit_pct = max(0, GET_LMK_PROPERTY(int32, "thrashing_limit",
3535         low_ram_device ? DEF_THRASHING_LOWRAM : DEF_THRASHING));
3536     thrashing_limit_decay_pct = clamp(0, 100, GET_LMK_PROPERTY(int32, "thrashing_limit_decay",
3537         low_ram_device ? DEF_THRASHING_DECAY_LOWRAM : DEF_THRASHING_DECAY));
3538     thrashing_critical_pct = max(0, GET_LMK_PROPERTY(int32, "thrashing_limit_critical",
3539         thrashing_limit_pct * 2));
3540     swap_util_max = clamp(0, 100, GET_LMK_PROPERTY(int32, "swap_util_max", 100));
3541     filecache_min_kb = GET_LMK_PROPERTY(int64, "filecache_min_kb", 0);
3542     stall_limit_critical = GET_LMK_PROPERTY(int64, "stall_limit_critical", 100);
3543 }
3544 
main(int argc,char ** argv)3545 int main(int argc, char **argv) {
3546     if ((argc > 1) && argv[1] && !strcmp(argv[1], "--reinit")) {
3547         if (property_set(LMKD_REINIT_PROP, "")) {
3548             ALOGE("Failed to reset " LMKD_REINIT_PROP " property");
3549         }
3550         return issue_reinit();
3551     }
3552 
3553     update_props();
3554 
3555     ctx = create_android_logger(KILLINFO_LOG_TAG);
3556 
3557     if (!init()) {
3558         if (!use_inkernel_interface) {
3559             /*
3560              * MCL_ONFAULT pins pages as they fault instead of loading
3561              * everything immediately all at once. (Which would be bad,
3562              * because as of this writing, we have a lot of mapped pages we
3563              * never use.) Old kernels will see MCL_ONFAULT and fail with
3564              * EINVAL; we ignore this failure.
3565              *
3566              * N.B. read the man page for mlockall. MCL_CURRENT | MCL_ONFAULT
3567              * pins ⊆ MCL_CURRENT, converging to just MCL_CURRENT as we fault
3568              * in pages.
3569              */
3570             /* CAP_IPC_LOCK required */
3571             if (mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT) && (errno != EINVAL)) {
3572                 ALOGW("mlockall failed %s", strerror(errno));
3573             }
3574 
3575             /* CAP_NICE required */
3576             struct sched_param param = {
3577                     .sched_priority = 1,
3578             };
3579             if (sched_setscheduler(0, SCHED_FIFO, &param)) {
3580                 ALOGW("set SCHED_FIFO failed %s", strerror(errno));
3581             }
3582         }
3583 
3584         mainloop();
3585     }
3586 
3587     android_log_destroy(&ctx);
3588 
3589     ALOGI("exiting");
3590     return 0;
3591 }
3592