<?xml version="1.0" encoding="US-ASCII"?>
<dblp>
<inproceedings key="conf/emnlp/FangMSCWSNT24" mdate="2025-06-13">
<author>Yuwei Fang</author>
<author>Willi Menapace</author>
<author>Aliaksandr Siarohin</author>
<author>Tsai-Shien Chen</author>
<author>Kuan-Chieh Wang</author>
<author>Ivan Skorokhodov</author>
<author>Graham Neubig</author>
<author>Sergey Tulyakov</author>
<title>VIMI: Grounding Video Generation through Multi-modal Instruction.</title>
<pages>4444-4456</pages>
<year>2024</year>
<booktitle>EMNLP</booktitle>
<ee type="oa">https://doi.org/10.18653/v1/2024.emnlp-main.254</ee>
<ee type="oa">https://aclanthology.org/2024.emnlp-main.254</ee>
<crossref>conf/emnlp/2024</crossref>
<url>db/conf/emnlp/emnlp2024.html#FangMSCWSNT24</url>
</inproceedings>
</dblp>
