Post

Replies

Boosts

Views

Activity

Kernel panic when using fclonefileat from ES
Hi, I am developing instant snapshot backup solution for macOS using Endpoint Security. We have stumbled upon a Kernel Panic when using "fclonefileat" API. We are catching a kernel panic on customer machines when attempting to clone the file during ES sync callback: panic(cpu 0 caller 0xfffffe002c495508): "apfs_io_lock_exclusive : Recursive exclusive lock attempt" @fs_utils.c:435 I have symbolized the backtrace to know it is related to clone operation with the following backtrace: apfs_io_lock_exclusive apfs_clone_internal apfs_vnop_clonefile I made a minimal repro that boils down to the following operations: apfs_crash_stress - launch thread to do rsrc writes static void *rsrc_write_worker(void *arg) { int id = (int)(long)arg; char buf[8192]; long n = 0; fill_pattern(buf, sizeof(buf), 'W' + id); while (n < ITERATION_LIMIT) { int file_idx = n % NUM_SOURCE_FILES; int fd = open(g_src_rsrc[file_idx], O_WRONLY | O_CREAT, 0644); if (fd >= 0) { off_t off = ((n * 4096) % RSRC_DATA_SIZE); pwrite(fd, buf, sizeof(buf), off); if ((n & 0x7) == 0) fsync(fd); close(fd); } else { setxattr(g_src[file_idx], "com.apple.ResourceFork", buf, sizeof(buf), 0, 0); } n++; } printf("[rsrc_wr_%d] done (%ld ops)\n", id, n); return NULL; } apfs_crash_es - simple ES client that is cloning the file (error checking omitted for brevity) static std::string volfsPath(uint64_t devId, uint64_t vnodeId) { return "/.vol/" + std::to_string(devId) + "/" + std::to_string(vnodeId); } static void cloneAndScheduleDelete(const std::string& sourcePath, dispatch_queue_t queue, uint64_t devId, uint64_t vnodeId) { struct stat st; if (stat(sourcePath.c_str(), &st) != 0 || !S_ISREG(st.st_mode)) return; int srcFd = open(sourcePath.c_str(), O_RDONLY); const char* cloneDir = "/Users/admin/Downloads/_clone"; mkdir(cloneDir, 0755); const char* filename = strrchr(sourcePath.c_str(), '/'); filename = filename ? filename + 1 : sourcePath.c_str(); std::string cloneFilename = std::string(filename) + ".clone." + std::to_string(time(nullptr)) + "." + std::to_string(getpid()); std::string clonePath = std::string(cloneDir) + "/" + cloneFilename; fclonefileat(srcFd, AT_FDCWD, clonePath.c_str(), 0); { dispatch_after(dispatch_time(DISPATCH_TIME_NOW, 1 * NSEC_PER_SEC), queue, ^{ if (unlink(clonePath.c_str()) == 0) { LOG("Deleted clone: %s", clonePath.c_str()); } else { LOG("Failed to delete clone: %s", clonePath.c_str()); } }); } close(srcFd); } static const es_file_t* file(const es_message_t* msg) { switch (msg->event_type) { case ES_EVENT_TYPE_AUTH_OPEN: return msg->event.open.file; case ES_EVENT_TYPE_AUTH_EXEC: return msg->event.exec.target->executable; case ES_EVENT_TYPE_AUTH_RENAME: return msg->event.rename.source; } return nullptr; } int main(void) { es_client_t* cli; auto ret = es_new_client(&cli, ^(es_client_t* client, const es_message_t * msgc) { if (msgc->process->is_es_client) { es_mute_process(client, &msgc->process->audit_token); return respond(client, msgc, true); } dispatch_async(esQueue, ^{ bool shouldClone = false; if (msgc->event_type == ES_EVENT_TYPE_AUTH_OPEN) { auto& ev = msgc->event.open; if (ev.fflag & (FWRITE | O_RDWR | O_WRONLY | O_TRUNC | O_APPEND)) { shouldClone = true; } } else if (msgc->event_type == ES_EVENT_TYPE_AUTH_UNLINK || msgc->event_type == ES_EVENT_TYPE_AUTH_RENAME) { shouldClone = true; } if (shouldClone) { if (auto f = ::file(msgc)) cloneAndScheduleDelete(f->path.data, cloneQueue, f->stat.st_dev, f->stat.st_ino); } respond(client, msgc, true); }); }); LOG("es_new_client -> %d", ret); es_event_type_t events[] = { ES_EVENT_TYPE_AUTH_OPEN, ES_EVENT_TYPE_AUTH_EXEC, ES_EVENT_TYPE_AUTH_RENAME, ES_EVENT_TYPE_AUTH_UNLINK, }; es_subscribe(cli, events, sizeof(events) / sizeof(*events)); } Create 2 terminal sessions and run the following commands: % sudo ./apfs_crash_es % sudo ./apfs_crash_stress ~/Downloads/test/ Machine will very quickly panic due to APFS deadlock. I expect that no userspace syscall should be able to cause kernel panic. It looks like a bug in APFS implementation and requires fix on XNU/kext side. We were able to reproduce this issue on macOS 26.3.1/15.6.1 on Intel/ARM machines. Here is the panic string: panic_string.txt Source code without XCode project: apfs_crash_es.cpp apfs_crash_stress.cpp Full XCode project + full panic is available at https://www.icloud.com/iclouddrive/0f215KkZffPOTLpETPo-LdaXw#apfs%5Fcrash%5Fes
1
0
48
3h
Kernel panic when using fclonefileat from ES
Hi, I am developing instant snapshot backup solution for macOS using Endpoint Security. We have stumbled upon a Kernel Panic when using "fclonefileat" API. We are catching a kernel panic on customer machines when attempting to clone the file during ES sync callback: panic(cpu 0 caller 0xfffffe002c495508): "apfs_io_lock_exclusive : Recursive exclusive lock attempt" @fs_utils.c:435 I have symbolized the backtrace to know it is related to clone operation with the following backtrace: apfs_io_lock_exclusive apfs_clone_internal apfs_vnop_clonefile I made a minimal repro that boils down to the following operations: apfs_crash_stress - launch thread to do rsrc writes static void *rsrc_write_worker(void *arg) { int id = (int)(long)arg; char buf[8192]; long n = 0; fill_pattern(buf, sizeof(buf), 'W' + id); while (n < ITERATION_LIMIT) { int file_idx = n % NUM_SOURCE_FILES; int fd = open(g_src_rsrc[file_idx], O_WRONLY | O_CREAT, 0644); if (fd >= 0) { off_t off = ((n * 4096) % RSRC_DATA_SIZE); pwrite(fd, buf, sizeof(buf), off); if ((n & 0x7) == 0) fsync(fd); close(fd); } else { setxattr(g_src[file_idx], "com.apple.ResourceFork", buf, sizeof(buf), 0, 0); } n++; } printf("[rsrc_wr_%d] done (%ld ops)\n", id, n); return NULL; } apfs_crash_es - simple ES client that is cloning the file (error checking omitted for brevity) static std::string volfsPath(uint64_t devId, uint64_t vnodeId) { return "/.vol/" + std::to_string(devId) + "/" + std::to_string(vnodeId); } static void cloneAndScheduleDelete(const std::string& sourcePath, dispatch_queue_t queue, uint64_t devId, uint64_t vnodeId) { struct stat st; if (stat(sourcePath.c_str(), &st) != 0 || !S_ISREG(st.st_mode)) return; int srcFd = open(sourcePath.c_str(), O_RDONLY); const char* cloneDir = "/Users/admin/Downloads/_clone"; mkdir(cloneDir, 0755); const char* filename = strrchr(sourcePath.c_str(), '/'); filename = filename ? filename + 1 : sourcePath.c_str(); std::string cloneFilename = std::string(filename) + ".clone." + std::to_string(time(nullptr)) + "." + std::to_string(getpid()); std::string clonePath = std::string(cloneDir) + "/" + cloneFilename; fclonefileat(srcFd, AT_FDCWD, clonePath.c_str(), 0); { dispatch_after(dispatch_time(DISPATCH_TIME_NOW, 1 * NSEC_PER_SEC), queue, ^{ if (unlink(clonePath.c_str()) == 0) { LOG("Deleted clone: %s", clonePath.c_str()); } else { LOG("Failed to delete clone: %s", clonePath.c_str()); } }); } close(srcFd); } static const es_file_t* file(const es_message_t* msg) { switch (msg->event_type) { case ES_EVENT_TYPE_AUTH_OPEN: return msg->event.open.file; case ES_EVENT_TYPE_AUTH_EXEC: return msg->event.exec.target->executable; case ES_EVENT_TYPE_AUTH_RENAME: return msg->event.rename.source; } return nullptr; } int main(void) { es_client_t* cli; auto ret = es_new_client(&cli, ^(es_client_t* client, const es_message_t * msgc) { if (msgc->process->is_es_client) { es_mute_process(client, &msgc->process->audit_token); return respond(client, msgc, true); } dispatch_async(esQueue, ^{ bool shouldClone = false; if (msgc->event_type == ES_EVENT_TYPE_AUTH_OPEN) { auto& ev = msgc->event.open; if (ev.fflag & (FWRITE | O_RDWR | O_WRONLY | O_TRUNC | O_APPEND)) { shouldClone = true; } } else if (msgc->event_type == ES_EVENT_TYPE_AUTH_UNLINK || msgc->event_type == ES_EVENT_TYPE_AUTH_RENAME) { shouldClone = true; } if (shouldClone) { if (auto f = ::file(msgc)) cloneAndScheduleDelete(f->path.data, cloneQueue, f->stat.st_dev, f->stat.st_ino); } respond(client, msgc, true); }); }); LOG("es_new_client -> %d", ret); es_event_type_t events[] = { ES_EVENT_TYPE_AUTH_OPEN, ES_EVENT_TYPE_AUTH_EXEC, ES_EVENT_TYPE_AUTH_RENAME, ES_EVENT_TYPE_AUTH_UNLINK, }; es_subscribe(cli, events, sizeof(events) / sizeof(*events)); } Create 2 terminal sessions and run the following commands: % sudo ./apfs_crash_es % sudo ./apfs_crash_stress ~/Downloads/test/ Machine will very quickly panic due to APFS deadlock. I expect that no userspace syscall should be able to cause kernel panic. It looks like a bug in APFS implementation and requires fix on XNU/kext side. We were able to reproduce this issue on macOS 26.3.1/15.6.1 on Intel/ARM machines. Here is the panic string: panic_string.txt Source code without XCode project: apfs_crash_es.cpp apfs_crash_stress.cpp Full XCode project + full panic is available at https://www.icloud.com/iclouddrive/0f215KkZffPOTLpETPo-LdaXw#apfs%5Fcrash%5Fes
Replies
1
Boosts
0
Views
48
Activity
3h