% Conference paper in the LCTES 2020 proceedings (pages 136--140).
% Acronyms in title, booktitle and series are brace-protected so that
% sentence-casing bibliography styles keep their capitalisation.
% The fields copyright, eventtitle, eventdate, day, keywords, abstract and
% language are not read by the standard BibTeX styles; they carry export
% metadata that should not be printed as part of the reference.
@inproceedings{cb73b5660ef0468e8d1cec3de736542e,
  author     = {Oh, Chanyoung and Park, Gunju and Kim, Sumin and Kim, Dohee and Yi, Youngmin},
  title      = {Towards Real-Time {CNN} Inference from a Video Stream on a Mobile {GPU} ({WiP} Paper)},
  booktitle  = {{LCTES} 2020 - 21st {ACM} {SIGPLAN/SIGBED} Conference on Languages, Compilers, and Tools for Embedded Systems},
  series     = {Proceedings of the {ACM} {SIGPLAN} Conference on Languages, Compilers, and Tools for Embedded Systems ({LCTES})},
  publisher  = {Association for Computing Machinery},
  pages      = {136--140},
  year       = {2020},
  month      = jun,
  day        = {16},
  doi        = {10.1145/3372799.3394366},
  eventtitle = {21st {ACM} {SIGPLAN/SIGBED} Conference on Languages, Compilers, and Tools for Embedded Systems, {LCTES} 2020},
  eventdate  = {2020-06-16},
  copyright  = {{\textcopyright} 2020 ACM},
  language   = {English},
  keywords   = {face detection, on-device deep learning, persistent threads, quantization},
  abstract   = {While there are several frameworks for CNN inference on mobile GPUs, they do not achieve real-time processing for the most of the CNNs that aim at reasonable accuracy since they all employ kernel-by-kernel execution model and do not effectively support INT8 quantization yet. In this paper, we reveal that mobile GPUs suffer from large kernel launch overhead unlike server GPUs, and then propose an on-device deep learning inference framework that can achieve real-time inference of CNNs on mobile GPUs by removing kernel launch overhead and by effectively exploiting INT8 quantization. We have evaluated the proposed framework with a state-of-the-art CNN based face detector (RetinaFace), and observed up to 2.01X of speedup compared to ARM Compute Library (ACL) on a commodity smartphone.},
}