Hi, I am developing an instant snapshot backup solution for macOS using Endpoint Security. We have stumbled upon a kernel panic when using the "fclonefileat" API.
We are catching a kernel panic on customer machines when attempting to clone a file during an ES sync callback:
panic(cpu 0 caller 0xfffffe002c495508): "apfs_io_lock_exclusive : Recursive exclusive lock attempt" @fs_utils.c:435
I have symbolized the backtrace and found that the panic is related to the clone operation:
apfs_io_lock_exclusive
apfs_clone_internal
apfs_vnop_clonefile
I made a minimal repro that boils down to the following operations:
apfs_crash_stress - launches a thread that does resource fork (rsrc) writes:
/*
 * Thread entry point for the stress writer.
 *
 * arg carries the worker's integer id smuggled through the void* slot.
 * The id selects the pattern byte ('W' + id) handed to fill_pattern() and
 * tags the final progress line.
 *
 * For ITERATION_LIMIT rounds the worker picks a source file round-robin,
 * opens its g_src_rsrc path for writing and overwrites 8 KiB at an offset
 * that advances 4 KiB per round (wrapping at RSRC_DATA_SIZE), so successive
 * writes overlap. Every 8th round the descriptor is fsync'ed before close.
 * When the open fails, the same buffer is stored through the
 * "com.apple.ResourceFork" extended attribute of the g_src path instead.
 *
 * All I/O is best effort: return values of pwrite/fsync/setxattr are
 * deliberately not inspected. Always returns NULL.
 */
static void *rsrc_write_worker(void *arg)
{
    const int worker_id = (int)(long)arg;
    char payload[8192];
    long ops;

    fill_pattern(payload, sizeof(payload), 'W' + worker_id);

    for (ops = 0; ops < ITERATION_LIMIT; ops++) {
        const int slot = ops % NUM_SOURCE_FILES;
        const int fd = open(g_src_rsrc[slot], O_WRONLY | O_CREAT, 0644);

        if (fd < 0) {
            /* Fallback path: write the fork through the xattr interface. */
            setxattr(g_src[slot], "com.apple.ResourceFork",
                     payload, sizeof(payload), 0, 0);
            continue;
        }

        const off_t where = ((ops * 4096) % RSRC_DATA_SIZE);
        pwrite(fd, payload, sizeof(payload), where);

        /* Flush on rounds 0, 8, 16, ... only. */
        if ((ops & 0x7) == 0)
            fsync(fd);

        close(fd);
    }

    printf("[rsrc_wr_%d] done (%ld ops)\n", worker_id, ops);
    return NULL;
}
apfs_crash_es - a simple ES client that clones the file (error checking omitted for brevity):
/// Builds a path of the form "/.vol/<devId>/<vnodeId>".
///
/// @param devId    Device identifier, rendered in decimal.
/// @param vnodeId  Vnode (inode) identifier, rendered in decimal.
/// @return The assembled path string.
static std::string volfsPath(uint64_t devId, uint64_t vnodeId)
{
    std::string path{"/.vol/"};
    path += std::to_string(devId);
    path += "/";
    path += std::to_string(vnodeId);
    return path;
}
static void cloneAndScheduleDelete(const std::string& sourcePath, dispatch_queue_t queue, uint64_t devId, uint64_t vnodeId)
{
struct stat st;
if (stat(sourcePath.c_str(), &st) != 0 || !S_ISREG(st.st_mode))
return;
int srcFd = open(sourcePath.c_str(), O_RDONLY);
const char* cloneDir = "/Users/admin/Downloads/_clone";
mkdir(cloneDir, 0755);
const char* filename = strrchr(sourcePath.c_str(), '/');
filename = filename ? filename + 1 : sourcePath.c_str();
std::string cloneFilename = std::string(filename) + ".clone." + std::to_string(time(nullptr)) + "." + std::to_string(getpid());
std::string clonePath = std::string(cloneDir) + "/" + cloneFilename;
fclonefileat(srcFd, AT_FDCWD, clonePath.c_str(), 0);
{
dispatch_after(dispatch_time(DISPATCH_TIME_NOW, 1 * NSEC_PER_SEC), queue, ^{
if (unlink(clonePath.c_str()) == 0)
{
LOG("Deleted clone: %s", clonePath.c_str());
}
else
{
LOG("Failed to delete clone: %s", clonePath.c_str());
}
});
}
close(srcFd);
}
/// Returns the file an AUTH event refers to, or nullptr for event types this
/// client does not handle.
///
/// Mapping:
///   - AUTH_OPEN   -> the file being opened
///   - AUTH_EXEC   -> the executable of the exec target
///   - AUTH_RENAME -> the rename source
///   - AUTH_UNLINK -> the file being unlinked
///
/// @param msg  Endpoint Security message; must not be null.
static const es_file_t* file(const es_message_t* msg)
{
    switch (msg->event_type)
    {
        case ES_EVENT_TYPE_AUTH_OPEN:
            return msg->event.open.file;
        case ES_EVENT_TYPE_AUTH_EXEC:
            return msg->event.exec.target->executable;
        case ES_EVENT_TYPE_AUTH_RENAME:
            return msg->event.rename.source;
        // The client subscribes to AUTH_UNLINK and wants to clone the file
        // before it is removed; without this case the lookup yielded nullptr
        // and unlink events were never cloned.
        case ES_EVENT_TYPE_AUTH_UNLINK:
            return msg->event.unlink.target;
        default:
            return nullptr;
    }
}
int main(void)
{
es_client_t* cli;
auto ret = es_new_client(&cli, ^(es_client_t* client, const es_message_t * msgc)
{
if (msgc->process->is_es_client)
{
es_mute_process(client, &msgc->process->audit_token);
return respond(client, msgc, true);
}
dispatch_async(esQueue, ^{
bool shouldClone = false;
if (msgc->event_type == ES_EVENT_TYPE_AUTH_OPEN)
{
auto& ev = msgc->event.open;
if (ev.fflag & (FWRITE | O_RDWR | O_WRONLY | O_TRUNC | O_APPEND))
{
shouldClone = true;
}
}
else if (msgc->event_type == ES_EVENT_TYPE_AUTH_UNLINK || msgc->event_type == ES_EVENT_TYPE_AUTH_RENAME)
{
shouldClone = true;
}
if (shouldClone)
{
if (auto f = ::file(msgc))
cloneAndScheduleDelete(f->path.data, cloneQueue, f->stat.st_dev, f->stat.st_ino);
}
respond(client, msgc, true);
});
});
LOG("es_new_client -> %d", ret);
es_event_type_t events[] = {
ES_EVENT_TYPE_AUTH_OPEN,
ES_EVENT_TYPE_AUTH_EXEC,
ES_EVENT_TYPE_AUTH_RENAME,
ES_EVENT_TYPE_AUTH_UNLINK,
};
es_subscribe(cli, events, sizeof(events) / sizeof(*events));
}
Create 2 terminal sessions and run the following commands:
% sudo ./apfs_crash_es
% sudo ./apfs_crash_stress ~/Downloads/test/
The machine will panic very quickly due to an APFS deadlock. I expect that no userspace syscall should be able to cause a kernel panic. It looks like a bug in the APFS implementation that requires a fix on the XNU/kext side.
We were able to reproduce this issue on macOS 26.3.1 and 15.6.1, on both Intel and ARM machines.
Here is the panic string:
panic_string.txt
Source code without the Xcode project:
apfs_crash_es.cpp
apfs_crash_stress.cpp
The full Xcode project and the full panic log are available at https://www.icloud.com/iclouddrive/0f215KkZffPOTLpETPo-LdaXw#apfs%5Fcrash%5Fes
1
0
48