@inproceedings{3a96e9d08fad4505af406f17626b283b,
  title         = "A study of evaluation metrics and datasets for video captioning",
  abstract      = "With the fast growing interest in deep learning, various applications and machine learning tasks are emerged in recent years. Video captioning is especially gaining a lot of attention from both computer vision and natural language processing fields. Generating captions is usually performed by jointly learning of different types of data modalities that share common themes in the video. Learning with the joining representations of different modalities is very challenging due to the inherent heterogeneity resided in the mixed information of visual scenes, speech dialogs, music and sounds, and etc. Consequently, it is hard to evaluate the quality of video captioning results. In this paper, we introduce well-known metrics and datasets for evaluation of video captioning. We compare the existing metrics and datasets to derive a new research proposal for the evaluation of video descriptions.",
  keywords      = "Benchmark datasets, Movie captioning, Video captioning, Video to text",
  author        = "Park, Jaehui and Song, Chibon and Han, {Ji Hyeong}",
  note          = "Publisher Copyright: {\textcopyright} 2017 IEEE.; 2nd International Conference on Intelligent Informatics and Biomedical Sciences, ICIIBMS 2017 ; Conference date: 24-11-2017 Through 26-11-2017",
  year          = "2017",
  month         = jul,
  day           = "2",
  doi           = "10.1109/ICIIBMS.2017.8279760",
  language      = "English",
  series        = "ICIIBMS 2017 - 2nd International Conference on Intelligent Informatics and Biomedical Sciences",
  publisher     = "Institute of Electrical and Electronics Engineers Inc.",
  pages         = "172--175",
  booktitle     = "ICIIBMS 2017 - 2nd International Conference on Intelligent Informatics and Biomedical Sciences",
  address       = "United States",
  internal-note = "NOTE(review): 'address' holds a country, not the publisher city; 'Chibon' spelling unverified against the IEEE record -- confirm both.",
}