% Conference paper by Baird, Fensch, Scholz and Sinkarovs (2018):
% "A lightweight approach to GPU resilience", in Euro-Par 2018 (Lecture Notes in Computer Science).
% Personal names use the unambiguous "Last, First" form, and the acronym in the
% title is brace-protected so that sentence-casing styles keep it capitalised.
@inproceedings{61da83ace82d40aaae4e40b0c7a68368,
  author    = {Baird, Max and Fensch, Christian and Scholz, Sven-Bodo and {\v S}inkarovs, Artjoms},
  title     = {A lightweight approach to {GPU} resilience},
  booktitle = {{Euro-Par} 2018},
  editor    = {Mencagli, Gabriele and Heras, Dora B.},
  series    = {Lecture Notes in Computer Science},
  publisher = {Springer},
  year      = {2018},
  month     = dec,
  day       = {31},
  pages     = {826--838},
  doi       = {10.1007/978-3-030-10549-5_64},
  isbn      = {9783030105488},
  language  = {English},
  keywords  = {GPU, HPC, Resilience},
  note      = {24th International Conference on Parallel and Distributed Computing 2018, Euro-Par 2018; conference date: 27-08-2018 through 28-08-2018},
  abstract  = {Resilience for HPC applications typically is implemented as a CPU-based rollback-recovery technique. In this context, long running accelerator computations on GPUs pose a major challenge as these devices usually do not offer any means of interrupt. This paper proposes a solution to the aforementioned problem: it suggests a novel approach that rewrites GPU kernels so that a soft interrupt of their execution becomes possible. Our approach is based on the Compute Unified Device Architecture (CUDA) by Nvidia and works by taking advantage of CUDA{\textquoteright}s execution model of partitioning threads into blocks. In essence, we re-write the kernel so that each block determines whether it should continue execution or return control to the CPU. By doing so we are able to perform a premature interrupt of kernels.},
}