Native crash的核心处理工作由debuggerd守护进程完成,上一篇文章 调试系列4:Debuggerd原理篇(上) 已经介绍过debuggerd的工作原理。要了解Native Crash,首先从位于 begin.S 中的应用程序入口 _start 所调用的 __linker_init 入手。
[-> arch/arm/begin.S]
ENTRY(_start)
  // Copy the initial stack pointer into r0 before calling into the linker
  // (presumably this becomes the raw_args argument of __linker_init -- see section 1.2).
  mov r0, sp
  bl __linker_init
  /* linker init returns the _entry address in the main image */
  // Jump to the address that __linker_init left in r0.
  mov pc, r0
END(_start)
[-> linker.cpp]
// Linker entry point invoked from _start.
// Wraps the raw argument pointer in a KernelArgumentBlock, reads the AT_BASE
// auxiliary-vector value as the linker address and returns the start address
// produced by __linker_init_post_relocation().
// The "..." marks code the article elided.
extern "C" ElfW(Addr) __linker_init(void* raw_args) {
  KernelArgumentBlock args(raw_args);
  ElfW(Addr) linker_addr = args.getauxval(AT_BASE);
  ...
  // See section 1.3.
  ElfW(Addr) start_address = __linker_init_post_relocation(args, linker_addr);
  return start_address;
}
[-> linker.cpp]
// Second stage of linker start-up. The part shown here sanitizes the
// environment, initializes system properties and installs the debuggerd
// signal handlers. The "..." marks code the article elided (including the
// statement that produces the return value).
static ElfW(Addr) __linker_init_post_relocation(KernelArgumentBlock& args, ElfW(Addr) linker_base) {
  ...
  // Sanitize the environment.
  __libc_init_AT_SECURE(args);
  // Initialize system properties
  __system_properties_init();
  // Register the crash signal handlers (see section 1.4).
  debuggerd_init();
  ...
}
[-> linker/debugger.cpp]
// Installs debuggerd_signal_handler (see section 1.5) as the handler for every
// signal in the table below. The return value of sigaction() is not checked.
__LIBC_HIDDEN__ void debuggerd_init() {
  struct sigaction handler_config;
  memset(&handler_config, 0, sizeof(handler_config));
  sigemptyset(&handler_config.sa_mask);
  handler_config.sa_sigaction = debuggerd_signal_handler;

  // SA_RESTART: a syscall interrupted by the handler is restarted automatically.
  // SA_SIGINFO: the handler also receives the siginfo_t describing the signal.
  // SA_ONSTACK: run on the alternate signal stack (if one is available) so that
  //             stack overflows can still be caught.
  handler_config.sa_flags = SA_RESTART | SA_SIGINFO | SA_ONSTACK;

  // Signals are registered in exactly this order.
  const int handled_signals[] = {
    SIGABRT,
    SIGBUS,
    SIGFPE,
    SIGILL,
    SIGPIPE,
    SIGSEGV,
#if defined(SIGSTKFLT)
    SIGSTKFLT,
#endif
    SIGTRAP,
  };
  for (int signo : handled_signals) {
    sigaction(signo, &handler_config, nullptr);
  }
}
链接到bionic的native程序(C/C++)出现异常时,kernel会发送相应的signal;当进程捕获到致命的signal时,会通知debuggerd调用ptrace来获取有价值的信息(在进程真正crash之前)。
[-> linker/debugger.cpp]
static void debuggerd_signal_handler(int signal_number, siginfo_t* info, void*) { if (!have_siginfo(signal_number)) { info = nullptr; //SA_SIGINFO标识被意外清空,则info未定义 } //防止debuggerd无法链接时,仍可以输出一些简要signal信息 log_signal_summary(signal_number, info); //建立于debuggerd的socket通信连接 【见小节1.6】 send_debuggerd_packet(info); //重置信号处理函数为SIG_DFL(默认操作) signal(signal_number, SIG_DFL); switch (signal_number) { case SIGABRT: case SIGFPE: case SIGPIPE: #if defined(SIGSTKFLT) case SIGSTKFLT: #endif case SIGTRAP: tgkill(getpid(), gettid(), signal_number); break; default: // SIGILL, SIGBUS, SIGSEGV break; } }
[-> linker/debugger.cpp]
// Reports the crash of the calling thread to debuggerd.
// Sends a debugger_msg_t with action DEBUGGER_ACTION_CRASH over the debuggerd
// socket and, if the whole message was written, blocks until a one-byte reply
// arrives. `info` may be nullptr, in which case original_si_code is sent as 0.
// The "..." marks code the article elided.
static void send_debuggerd_packet(siginfo_t* info) {
  // Mutex that keeps several crashing threads from talking to debuggerd at the same time.
  static pthread_mutex_t crash_mutex = PTHREAD_MUTEX_INITIALIZER;
  int ret = pthread_mutex_trylock(&crash_mutex);
  if (ret != 0) {
    if (ret == EBUSY) {
      __libc_format_log(ANDROID_LOG_INFO, "libc",
                        "Another thread contacted debuggerd first; not contacting debuggerd.");
      // Block until the other thread releases the lock, then acquire it.
      pthread_mutex_lock(&crash_mutex);
    }
    // Either way this thread does not contact debuggerd itself.
    return;
  }
  // NOTE(review): no pthread_mutex_unlock() is visible in this excerpt -- confirm
  // against the full source whether the lock is intentionally held.

  // Open the socket channel to debuggerd.
  int s = socket_abstract_client(DEBUGGER_SOCKET_NAME, SOCK_STREAM | SOCK_CLOEXEC);
  ...
  debugger_msg_t msg;
  msg.action = DEBUGGER_ACTION_CRASH;
  msg.tid = gettid();
  msg.abort_msg_address = reinterpret_cast<uintptr_t>(g_abort_message);
  msg.original_si_code = (info != nullptr) ? info->si_code : 0;
  // Send the DEBUGGER_ACTION_CRASH message to the debuggerd server.
  ret = TEMP_FAILURE_RETRY(write(s, &msg, sizeof(msg)));
  if (ret == sizeof(msg)) {
    char debuggerd_ack;
    // Block until the debuggerd server answers with one byte.
    ret = TEMP_FAILURE_RETRY(read(s, &debuggerd_ack, 1));
    // Preserve errno across the notify_gdb_of_libraries() call.
    int saved_errno = errno;
    notify_gdb_of_libraries();
    errno = saved_errno;
  }
  close(s);
}
该方法的主要功能:建立与debuggerd的socket连接,将 action = DEBUGGER_ACTION_CRASH 的消息发送给debuggerd服务端,并阻塞等待服务端的应答。接下来,看看debuggerd服务端接收到 DEBUGGER_ACTION_CRASH 消息后的处理流程。
debuggerd 守护进程启动后,一直在等待socket client的连接。当native crash发生后,崩溃进程便会向debuggerd发送 action = DEBUGGER_ACTION_CRASH 的消息。
[-> /debuggerd/debuggerd.cpp]
// debuggerd server loop: accepts client connections on socket `s` forever and
// hands each accepted descriptor to handle_request().
// The "..." marks code the article elided (including the creation of `s`).
static int do_server() {
  ...
  for (;;) {
    sockaddr_storage ss;
    sockaddr* addrp = reinterpret_cast<sockaddr*>(&ss);
    socklen_t alen = sizeof(ss);
    // Wait for a client to connect.
    int fd = accept4(s, addrp, &alen, SOCK_CLOEXEC);
    if (fd == -1) {
      continue;  // accept failed; go back to waiting
    }
    // Handle the request sent by the crashing native process (see section 2.2).
    handle_request(fd);
  }
  return 0;
}
[-> /debuggerd/debuggerd.cpp]
static void handle_request(int fd) { ... //读取client发送过来的请求【见小节3.5】 int status = read_request(fd, &request); ... //fork子进程来处理其余请求命令 pid_t fork_pid = fork(); if (fork_pid == -1) { ALOGE("debuggerd: failed to fork: %s/n", strerror(errno)); } else if (fork_pid == 0) { //子进程执行【见小节2.3】 worker_process(fd, request); } else { //父进程执行【见小节2.4】 monitor_worker_process(fork_pid, request); } }
对于client发送过来的请求,server端通过fork子进程来处理。
[-> /debuggerd/debuggerd.cpp]
// Child-process side of handle_request(): attaches to the crashing thread,
// performs the dump, reports the result to the Activity Manager, detaches and
// finally sends crash_signal to the target.
// The "..." marks code the article elided (including the declarations of
// `siblings` and `attach_gdb`).
static void worker_process(int fd, debugger_request_t& request) {
  std::string tombstone_path;
  int tombstone_fd = -1;
  switch (request.action) {
    case DEBUGGER_ACTION_CRASH:
      // Open the tombstone file.
      tombstone_fd = open_tombstone(&tombstone_path);
      if (tombstone_fd == -1) {
        exit(1);  // the tombstone file could not be opened: terminate this process
      }
      break;
    ...
  }
  // Attach to the target thread.
  if (ptrace(PTRACE_ATTACH, request.tid, 0, 0) != 0) {
    exit(1);  // attach failed: terminate this process
  }
  ...
  // Create the BacktraceMap for the target pid (see section 3.6.2).
  std::unique_ptr<BacktraceMap> backtrace_map(BacktraceMap::Create(request.pid));
  int amfd = -1;
  std::unique_ptr<std::string> amfd_data;
  if (request.action == DEBUGGER_ACTION_CRASH) {
    // For a native crash, connect to the Activity Manager (see section 2.3.1).
    amfd = activity_manager_connect();
    amfd_data.reset(new std::string);
  }
  bool succeeded = false;
  // Drop privileges before dumping.
  if (!drop_privileges()) {
    _exit(1);  // dropping privileges failed: terminate
  }
  // Defaults to SIGKILL; perform_dump() may overwrite it through the pointer.
  int crash_signal = SIGKILL;
  // Perform the dump (see section 2.3.2).
  succeeded = perform_dump(request, fd, tombstone_fd, backtrace_map.get(), siblings,
                           &crash_signal, amfd_data.get());
  if (!attach_gdb) {
    // Tell the Activity Manager about the crash (see section 2.3.3).
    // NOTE(review): amfd_data is only reset for DEBUGGER_ACTION_CRASH; for any
    // other action this dereferences a null pointer -- confirm against the full source.
    activity_manager_write(request.pid, crash_signal, amfd, *amfd_data.get());
  }
  // Detach from the target thread and from every sibling.
  ptrace(PTRACE_DETACH, request.tid, 0, 0);
  for (pid_t sibling : siblings) {
    ptrace(PTRACE_DETACH, sibling, 0, 0);
  }
  if (!attach_gdb && request.action == DEBUGGER_ACTION_CRASH) {
    // Send crash_signal (SIGKILL unless perform_dump changed it) to the target (see section 2.3.4).
    if (!send_signal(request.pid, request.tid, crash_signal)) {
      ALOGE("debuggerd: failed to kill process %d: %s", request.pid, strerror(errno));
    }
  }
  ...
}
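整个过程比较复杂,下面只介绍attach_gdb=false的执行流程:
1. 当请求为 DEBUGGER_ACTION_CRASH 时,打开tombstone文件;
2. 通过ptrace attach到目标进程;
3. 创建BacktraceMap;
4. 当请求为 DEBUGGER_ACTION_CRASH 时,执行 activity_manager_connect() 连接到AMS;
5. 执行 drop_privileges() 取消特权模式;
6. 执行 perform_dump():当收到SIGBUS等致命信号,则调用 engrave_tombstone();
7. 执行 activity_manager_write() 将crash情况告知AMS;
8. 通过ptrace detach目标进程;
9. 当请求为 DEBUGGER_ACTION_CRASH 时,调用 send_signal() 向目标进程发送信号。
[-> debuggerd.cpp]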
static int activity_manager_connect() { android::base::unique_fd amfd(socket(PF_UNIX, SOCK_STREAM, 0)); if (amfd.get() < -1) { return -1; ///无法连接到ActivityManager(socket失败) } struct sockaddr_un address; memset(&address, 0, sizeof(address)); address.sun_family = AF_UNIX; //该路径必须匹配NativeCrashListener.java中的定义 strncpy(address.sun_path, "/data/system/ndebugsocket", sizeof(address.sun_path)); if (TEMP_FAILURE_RETRY(connect(amfd.get(), reinterpret_cast<struct sockaddr*>(&address), sizeof(address))) == -1) { return -1; //无法连接到ActivityManager(connect失败) } struct timeval tv; memset(&tv, 0, sizeof(tv)); tv.tv_sec = 1; if (setsockopt(amfd.get(), SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)) == -1) { return -1; //无法连接到ActivityManager(setsockopt SO_SNDTIMEO失败) } tv.tv_sec = 3; if (setsockopt(amfd.get(), SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) == -1) { return -1; //无法连接到ActivityManager(setsockopt SO_RCVTIMEO失败) } return amfd.release(); }
该方法的功能是建立与 ActivityManager
的socket连接。
根据接收到不同的signal采取相应的操作
[-> debuggerd.cpp]
static bool perform_dump(const debugger_request_t& request, int fd, int tombstone_fd, BacktraceMap* backtrace_map, const std::set<pid_t>& siblings, int* crash_signal, std::string* amfd_data) { if (TEMP_FAILURE_RETRY(write(fd, "/0", 1)) != 1) { return false; //无法响应client端请求 } int total_sleep_time_usec = 0; while (true) { //等待信号到来 int signal = wait_for_signal(request.tid, &total_sleep_time_usec); switch (signal) { ... case SIGABRT: case SIGBUS: case SIGFPE: case SIGILL: case SIGSEGV: #ifdef SIGSTKFLT case SIGSTKFLT: #endif case SIGTRAP: ALOGV("stopped -- fatal signal/n"); *crash_signal = signal; //这是输出tombstone信息最为核心的方法 engrave_tombstone(tombstone_fd, backtrace_map, request.pid, request.tid, siblings, signal, request.original_si_code, request.abort_msg_address, amfd_data); break; default: ALOGE("debuggerd: process stopped due to unexpected signal %d/n", signal); break; } break; } return true; }
前一篇文章已介绍过engrave_tombstone的功能内容,此处忽略。
以下信号都属于致命信号:SIGABRT、SIGBUS、SIGFPE、SIGILL、SIGSEGV、SIGSTKFLT、SIGTRAP。
[-> debuggerd.cpp]
static void activity_manager_write(int pid, int signal, int amfd, const std::string& amfd_data) { if (amfd == -1) { return; } //写入pid和signal,以及原始dump信息,最后添加0以标记结束 uint32_t datum = htonl(pid); if (!android::base::WriteFully(amfd, &datum, 4)) { return; //AM pid写入失败 } datum = htonl(signal); if (!android::base::WriteFully(amfd, &datum, 4)) { return;//AM signal写入失败 } if (!android::base::WriteFully(amfd, amfd_data.c_str(), amfd_data.size())) { return;//AM data写入失败 } uint8_t eodMarker = 0; if (!android::base::WriteFully(amfd, &eodMarker, 1)) { return; //AM eod 写入失败 } //读取应答消息,如果3s超时未收到则读取失败 android::base::ReadFully(amfd, &eodMarker, 1); }
此处只是向目标进程发送SIGKILL信号,用于杀掉目标进程。文章《理解杀进程的实现原理》已详细讲述过发送SIGKILL信号的处理流程。
[-> SystemServer.java]
/**
 * Excerpt of SystemServer start-up: once the ActivityManagerService reports
 * that the system is ready, enters boot phase PHASE_ACTIVITY_MANAGER_READY and
 * starts observing native crashes.
 * The "..." marks code the article elided.
 */
private void startOtherServices() {
    ...
    mActivityManagerService.systemReady(new Runnable() {
        @Override
        public void run() {
            ...
            mSystemServiceManager.startBootPhase(
                    SystemService.PHASE_ACTIVITY_MANAGER_READY);
            try {
                // Start the native crash listener (see section 3.2).
                mActivityManagerService.startObservingNativeCrashes();
            } catch (Throwable e) {
                reportWtf("observing native crashes", e);
            }
        }
    });  // Fixed: the systemReady(...) call was never closed in the excerpt.
}
当开机过程中服务启动到阶段PHASE_ACTIVITY_MANAGER_READY(550),即服务可以广播自己的Intents时,便会启动native crash的监听线程。
[-> ActivityManagerService.java]
public void startObservingNativeCrashes() { //【见】 final NativeCrashListener ncl = new NativeCrashListener(this); ncl.start(); }
NativeCrashListener继承于 Thread
,是一个独立的线程,调用start方法来启动线程开始工作。
[-> NativeCrashListener.java]
/**
 * Listener thread body: creates the UNIX-domain server socket at
 * DEBUGGERD_SOCKET_PATH and then loops forever accepting connections.
 * Crash data is consumed only from peers whose uid is 0; every accepted peer
 * is sent a one-byte acknowledgement before being closed.
 * The "..." marks code the article elided.
 */
public void run() {
    final byte[] ackSignal = new byte[1];
    {
        // DEBUGGERD_SOCKET_PATH is "/data/system/ndebugsocket".
        // Remove a stale socket file, if one exists, before binding.
        File socketFile = new File(DEBUGGERD_SOCKET_PATH);
        if (socketFile.exists()) {
            socketFile.delete();
        }
    }
    try {
        FileDescriptor serverFd = Os.socket(AF_UNIX, SOCK_STREAM, 0);
        // Create the server side of the socket.
        final UnixSocketAddress sockAddr = UnixSocketAddress.createFileSystem(
                DEBUGGERD_SOCKET_PATH);
        Os.bind(serverFd, sockAddr);
        Os.listen(serverFd, 1);
        while (true) {
            FileDescriptor peerFd = null;
            try {
                // Wait for debuggerd to connect.
                peerFd = Os.accept(serverFd, null /* peerAddress */);
                // peerFd is now the socket descriptor of the connecting debuggerd.
                if (peerFd != null) {
                    // Only the superuser is allowed to talk over this socket.
                    StructUcred credentials =
                            Os.getsockoptUcred(peerFd, SOL_SOCKET, SO_PEERCRED);
                    if (credentials.uid == 0) {
                        // Process the native crash data (see section 3.4).
                        consumeNativeCrashData(peerFd);
                    }
                }
            } catch (Exception e) {
                Slog.w(TAG, "Error handling connection", e);
            } finally {
                // Acknowledge the connection to debuggerd; this runs for every
                // accepted peer, whether or not its data was consumed.
                if (peerFd != null) {
                    Os.write(peerFd, ackSignal, 0, 1);  // write the acknowledgement byte
                    Os.close(peerFd);  // close the peer socket
                    ...
                }
            }
        }
    } catch (Exception e) {
        Slog.e(TAG, "Unable to init native debug socket!", e);
    }
}
“/data/system/ndebugsocket”文件权限为700,所有者为system:system。debuggerd以root权限运行,因此可以与该socket建立连接,而第三方app则不能。
[-> NativeCrashListener.java]
void consumeNativeCrashData(FileDescriptor fd) { //进入该方法,标识着debuggerd已经与AMS建立连接 final byte[] buf = new byte[4096]; final ByteArrayOutputStream os = new ByteArrayOutputStream(4096); try { //此处SOCKET_TIMEOUT_MILLIS=2s StructTimeval timeout = StructTimeval.fromMillis(SOCKET_TIMEOUT_MILLIS); Os.setsockoptTimeval(fd, SOL_SOCKET, SO_RCVTIMEO, timeout); Os.setsockoptTimeval(fd, SOL_SOCKET, SO_SNDTIMEO, timeout); //1.读取pid和signal number int headerBytes = readExactly(fd, buf, 0, 8); if (headerBytes != 8) { return; //读取失败 } int pid = unpackInt(buf, 0); int signal = unpackInt(buf, 4); //2.读取dump内容 if (pid > 0) { final ProcessRecord pr; synchronized (mAm.mPidsSelfLocked) { pr = mAm.mPidsSelfLocked.get(pid); } if (pr != null) { //persistent应用,直接忽略 if (pr.persistent) { return; } int bytes; do { //获取数据 bytes = Os.read(fd, buf, 0, buf.length); if (bytes > 0) { if (buf[bytes-1] == 0) { //到达文件EOD, 忽略该字节 os.write(buf, 0, bytes-1); break; } os.write(buf, 0, bytes); } } while (bytes > 0); synchronized (mAm) { pr.crashing = true; pr.forceCrashReport = true; } final String reportString = new String(os.toByteArray(), "UTF-8"); //异常处理native crash报告【见小节3.5】 (new NativeCrashReporter(pr, signal, reportString)).start(); } } } catch (Exception e) { Slog.e(TAG, "Exception dealing with report", e); } }
[-> NativeCrashListener.java]
class NativeCrashReporter extends Thread { public void run() { try { CrashInfo ci = new CrashInfo(); ci.exceptionClassName = "Native crash"; ci.exceptionMessage = Os.strsignal(mSignal); ci.throwFileName = "unknown"; ci.throwClassName = "unknown"; ci.throwMethodName = "unknown"; ci.stackTrace = mCrashReport; //AMS真正处理crash的过程 mAm.handleApplicationCrashInner("native_crash", mApp, mApp.processName, ci); } catch (Exception e) { Slog.e(TAG, "Unable to report native crash", e); } } }
handleApplicationCrashInner位于ActivityManagerService,不论是Native crash还是Java层crash都会调用到该方法,整个上层的crash流程比较复杂, 下一篇文章会再展开说明。
欢迎关注我的微博: Gityuan 。如果觉得我的文章对您有所帮助,请 ¥打赏支持,或者点击下方分享给更多的朋友。您的支持将激励我创作更多技术干货!