% Conference paper: "Checkpointing Kernel Executions of MPI+CUDA Applications",
% pages 694--706 of the Euro-Par 2019 workshops proceedings (Springer, LNCS series).
% The citation key looks like an export-generated identifier; it is kept unchanged
% so that existing citations of this entry continue to resolve.
% Acronyms and proper nouns in title/booktitle are brace-protected so that
% sentence-casing bibliography styles do not lowercase them.
% NOTE(review): the LNCS volume number is not recorded here -- confirm and add a
% volume field if the target style should print it.
@inproceedings{3e58db430145451e96347c204a963aed,
  author    = {Baird, Max and Scholz, Sven-Bodo and {\v{S}}inkarovs, Artjoms and Bautista-Gomez, Leonardo},
  title     = {Checkpointing Kernel Executions of {MPI+CUDA} Applications},
  booktitle = {{Euro-Par} 2019: Parallel Processing Workshops},
  series    = {Lecture Notes in Computer Science},
  publisher = {Springer},
  year      = {2020},
  pages     = {694--706},
  doi       = {10.1007/978-3-030-48340-1_53},
  isbn      = {9783030483395},
  language  = {English},
  keywords  = {Checkpoints, GPU, HPC, MPI, Resilience, Snapshots},
  note      = {25th International European Conference on Parallel and Distributed Computing 2019, xiWAT 2020 ; Conference date: 26-08-2019 Through 30-08-2019},
  abstract  = {This paper proposes a new approach to checkpointing MPI applications that use long-running CUDA kernels. It becomes possible to take snapshots of data residing on the GPUs without waiting for kernels to complete. The proposed technique is implemented in the context of the state of the art high performance fault tolerance library FTI. As a result we get an elegant solution to the problem of developing resilient MPI applications where GPU kernels run longer than the mean time between hardware failures. We describe in detail how we checkpoint/restart collaborative MPI-CUDA applications, and we provide an initial evaluation of the proposed approach using the Livermore Unstructured Lagrangian Explicit Shock Hydrodynamics (LULESH) application as a case study.},
}