{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,4]],"date-time":"2025-12-04T09:53:49Z","timestamp":1764842029399},"reference-count":62,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2017,7]]},"DOI":"10.1109\/cvpr.2017.447","type":"proceedings-article","created":{"date-parts":[[2017,11,9]],"date-time":"2017-11-09T21:50:33Z","timestamp":1510264233000},"source":"Crossref","is-referenced-by-count":39,"title":["Generating Descriptions with Grounded and Co-referenced People"],"prefix":"10.1109","author":[{"given":"Anna","family":"Rohrbach","sequence":"first","affiliation":[]},{"given":"Marcus","family":"Rohrbach","sequence":"additional","affiliation":[]},{"given":"Siyu","family":"Tang","sequence":"additional","affiliation":[]},{"given":"Seong Joon","family":"Oh","sequence":"additional","affiliation":[]},{"given":"Bernt","family":"Schiele","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298682"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.61"},{"key":"ref33","doi-asserted-by":"crossref","DOI":"10.1007\/978-3-319-11752-2_15","article-title":"Coherent multi-sentence video description with variable level of detail","author":"rohrbach","year":"2014","journal-title":"Proc German Conf Pattern Recognit"},{"key":"ref32","article-title":"Grounding of textual phrases in images by reconstruction","author":"rohrbach","year":"2016","journal-title":"Proceedings of the European Conference on Computer Vision (ECCV)"},{"key":"ref31","article-title":"Linking people in videos with &#x201C;their&#x201D; names using coreference resolution","author":"ramanathan","year":"2014","journal-title":"Proceedings of the European Conference on Computer Vision (ECCV)"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0987-1"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.447"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"ref34","doi-asserted-by":"crossref","DOI":"10.1007\/978-3-319-24947-6_17","article-title":"The long-short story of movie description","author":"rohrbach","year":"2015","journal-title":"Proc German Conf Pattern Recognit"},{"key":"ref60","author":"zaremba","year":"2014","journal-title":"Learning to execute"},{"key":"ref62","author":"zhou","year":"2015","journal-title":"Naive-deep face recognition Touching the limit of LFW benchmark or not?"},{"key":"ref61","article-title":"Learning Deep Features for Scene Recognition using Places Database","author":"zhou","year":"2014","journal-title":"Advances in Neural Information Processing Systems (NIPS)"},{"key":"ref28","article-title":"Deep face recognition","author":"parkhi","year":"2015","journal-title":"Proceedings of the British Machine Vision Conference (BMVC)"},{"key":"ref27","article-title":"It's in the bag: Stronger supervision for automated face labelling","author":"parkhi","year":"2015","journal-title":"Proceedings of the IEEE International Conference on Computer Vision Workshops (ICCV Workshops)"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-11752-2_56"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.3115\/1220175.1220180"},{"key":"ref1","article-title":"Neural machine translation by jointly learning to align and translate","author":"bahdanau","year":"2015","journal-title":"Proc of the Int Conf on Learning Representations (ICLR)"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.455"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.340"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.27"},{"key":"ref24","article-title":"Attention correctness in neural image captioning","author":"liu","year":"2017","journal-title":"Proceedings of the Conference on Artificial Intelligence (AAAI)"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.5244\/C.29.93"},{"key":"ref26","article-title":"Deep captioning with multimodal recurrent neural networks (m-rnn)","author":"mao","year":"2015","journal-title":"Proc of the Int Conf on Learning Representations (ICLR)"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.9"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.441"},{"key":"ref59","article-title":"Spatiotemporal attention models for grounded video captioning","author":"zanfir","year":"2016","journal-title":"Proceedings of the Asian Conference on Computer Vision (ACCV)"},{"key":"ref58","first-page":"69","article-title":"Modeling context in referring expressions","author":"yu","year":"2016","journal-title":"Proceedings of the European Conference on Computer Vision (ECCV)"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.496"},{"key":"ref56","article-title":"Grounded language learning from videos described with sentences","author":"yu","year":"2013","journal-title":"Proceedings of the Annual Meeting of the Association for Computational Linguistics (ACL)"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.503"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.512"},{"key":"ref53","article-title":"Show, attend and tell: Neural image caption generation with visual attention","author":"xu","year":"2015","journal-title":"Proceedings of the International Conference on Machine Learning (ICML)"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.541"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.129"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1007\/BF01589097"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2016.7532983"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.337"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref15","article-title":"LSDA: Large scale detection through adaptation","author":"hoffman","year":"2014","journal-title":"Advances in Neural Information Processing Systems (NIPS)"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.493"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N16-1147"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.494"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1007\/BF01581239"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.283"},{"key":"ref6","article-title":"Imagenet: A large-scale hierarchical image database","author":"deng","year":"2009","journal-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206667"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.5244\/C.20.92"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.515"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"ref46","article-title":"knock! knock! who is it?&#x201D; probabilistic person identification in tv-series","author":"tapaswi","year":"2012","journal-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299138"},{"key":"ref48","author":"torabi","year":"2015","journal-title":"Using descriptive video services to create a large data source for video annotation research"},{"key":"ref47","article-title":"Integrating language and vision to generate natural language descriptions of videos in the wild","author":"thomason","year":"2014","journal-title":"Proceedings of the International Conference On Computational Linguistics (COLING)"},{"key":"ref42","article-title":"who are you?&#x201D;-learning person specific classifiers from video","author":"sivic","year":"2009","journal-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)"},{"key":"ref41","article-title":"Very deep convolutional networks for large-scale image recognition","author":"simonyan","year":"2015","journal-title":"Proc of the Int Conf on Learning Representations (ICLR)"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.220"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298907"}],"event":{"name":"2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","location":"Honolulu, HI","start":{"date-parts":[[2017,7,21]]},"end":{"date-parts":[[2017,7,26]]}},"container-title":["2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/8097368\/8099483\/08099930.pdf?arnumber=8099930","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,8,28]],"date-time":"2023-08-28T08:30:05Z","timestamp":1693211405000},"score":1,"resource":{"primary":{"URL":"http:\/\/ieeexplore.ieee.org\/document\/8099930\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,7]]},"references-count":62,"URL":"https:\/\/doi.org\/10.1109\/cvpr.2017.447","relation":{},"subject":[],"published":{"date-parts":[[2017,7]]}}}