{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,18]],"date-time":"2026-06-18T15:45:21Z","timestamp":1781797521443,"version":"3.54.5"},"reference-count":72,"publisher":"IEEE","license":[{"start":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T00:00:00Z","timestamp":1772755200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T00:00:00Z","timestamp":1772755200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026,3,6]]},"DOI":"10.1109\/wacv61042.2026.00099","type":"proceedings-article","created":{"date-parts":[[2026,5,5]],"date-time":"2026-05-05T19:59:32Z","timestamp":1778011172000},"page":"948-958","source":"Crossref","is-referenced-by-count":3,"title":["Do generative video models understand physical principles?"],"prefix":"10.1109","author":[{"given":"Saman","family":"Motamed","sequence":"first","affiliation":[{"name":"Sofia University \u201cSt. 
Kliment Ohridski\u201d, INSAIT"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Laura","family":"Culp","sequence":"additional","affiliation":[{"name":"Google DeepMind"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kevin","family":"Swersky","sequence":"additional","affiliation":[{"name":"Google DeepMind"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Priyank","family":"Jaini","sequence":"additional","affiliation":[{"name":"Google DeepMind"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Robert","family":"Geirhos","sequence":"additional","affiliation":[{"name":"Google DeepMind"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Cosmos world foundation model platform for physical AI","author":"Agarwal","year":"2025"},{"key":"ref2","article-title":"Learning to poke by poking: Experiential learning of intuitive physics","volume-title":"Advances in neural information processing systems","volume":"29","author":"Agrawal"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-acl.205"},{"key":"ref4","author":"Bai","year":"2025","journal-title":"Impossible videos"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1002\/9780470996652.ch3"},{"key":"ref6","author":"Bansal","year":"2024","journal-title":"Videophy: Evaluating physical commonsense for video generation"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/3680528.3687614"},{"key":"ref8","article-title":"Cophy: Counterfactual learning of physical dynamics","author":"Baradel","year":"2019"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.7551\/mitpress\/9780262518420.003.0013"},{"key":"ref10","author":"Bear","year":"2021","journal-title":"Physion: Evaluating physical prediction from vision in humans and machines"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1073\/pnas.2218523120"},{"key":"ref12","article-title":"Stable video 
diffusion: Scaling latent video diffusion models to large datasets","author":"Blattmann","year":"2023"},{"key":"ref13","article-title":"Physgame: Uncovering physical commonsense violations in gameplay videos","author":"Cao","year":"2024"},{"key":"ref14","article-title":"A compositional object-based approach to learning physical dynamics","author":"Chang","year":"2016"},{"key":"ref15","article-title":"LLMPhy: Complex physical reasoning using large language models and world models","author":"Cherian","year":"2024"},{"key":"ref16","volume-title":"Veo2: Google\u2019s state-of-the-art video generation model","year":"2024"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1080\/02643294.2022.2106126"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1098\/rstb.2005.1622"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00695"},{"key":"ref20","article-title":"Generalisation in humans and deep neural networks","volume-title":"Advances in neural information processing systems","volume":"31","author":"Geirhos"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-020-00257-z"},{"key":"ref22","article-title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","year":"2024"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1037\/0033-295X.111.1.3"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.127"},{"key":"ref25","article-title":"Benchmarking neural network robustness to common corruptions and perturbations","volume-title":"International Conference on Learning 
Representations","author":"Hendrycks"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-1419"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2010.579"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02060"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02060"},{"key":"ref30","article-title":"OpenAI o1 system card","author":"Jaech","year":"2024"},{"key":"ref31","article-title":"GRASP: A novel benchmark for evaluating language grounding and situated physics understanding in multimodal language models","author":"Jassim","year":"2023"},{"key":"ref32","article-title":"How far is video generation from world model: A physical law perspective","author":"Kang","year":"2024"},{"key":"ref33","author":"Kang","year":"2024","journal-title":"How far is video generation from world model: A physical law perspective"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1016\/0010-0285(83)90017-8"},{"key":"ref35","first-page":"25105","article-title":"VideoPoet: A large language model for zero-shot video generation","volume-title":"Proceedings of the 41st International Conference on Machine Learning","author":"Kondratyuk"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1016\/j.tics.2017.06.002"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1017\/S0140525X16001837"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1037\/xge0001439"},{"key":"ref39","doi-asserted-by":"crossref","DOI":"10.32388\/BXC6X1","article-title":"Videohallu: Evaluating and mitigating multi-modal hallucinations on synthetic video understanding","author":"Li","year":"2025"},{"key":"ref40","article-title":"Generative physical AI in vision: A survey","author":"Liu","year":"2025"},{"key":"ref41","article-title":"Inference-time scaling for diffusion models beyond scaling denoising 
steps","author":"Ma","year":"2025"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1038\/scientificamerican0483-122"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1126\/science.210.4474.1139"},{"key":"ref44","author":"Meng","year":"2024","journal-title":"Towards world simulator: Crafting physical commonsense-based benchmark for video generation"},{"key":"ref45","volume-title":"Meta Movie Gen: AI-powered movie generation","year":"2024"},{"key":"ref46","article-title":"Travl: A recipe for making video-language models better judges of physics implausibility","author":"Motamed","year":"2025"},{"key":"ref47","volume-title":"Sora: OpenAI\u2019s Multimodal Agent","year":"2024"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1038\/s41562-022-01394-8"},{"key":"ref49","first-page":"18","article-title":"Vision language models are blind","volume-title":"Proceedings of the Asian Conference on Computer Vision","author":"Rahmanzadehgervi"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.706"},{"key":"ref51","article-title":"A survey of hallucination in large foundation models","author":"Rawte","year":"2023"},{"key":"ref52","article-title":"IntPhys: A framework and benchmark for visual intuitive physics reasoning","author":"Riochet","year":"2018"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1016\/j.actpsy.2006.05.005"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-024-00963-y"},{"key":"ref55","article-title":"Scaling LLM test-time compute optimally can be more effective than scaling model parameters","author":"Snell","year":"2024"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1016\/j.cognition.2024.105995"},{"key":"ref57","volume-title":"The origins of physical 
knowledge","author":"Spelke","year":"1988"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1037\/0033-295X.99.4.605"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1111\/j.2044-835X.1995.tb00669.x"},{"key":"ref60","article-title":"Beyond the imitation game: Quantifying and extrapolating the capabilities of language models","author":"Srivastava","year":"2022"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-emnlp.422"},{"key":"ref62","volume-title":"Pika labs","year":"2024"},{"key":"ref63","volume-title":"Runway","year":"2024"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1126\/science.1192788"},{"key":"ref65","article-title":"Physion++: Evaluating physical scene understanding that requires online inference of different physical properties","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Tung"},{"key":"ref66","article-title":"Towards accurate generative models of video: A new metric & challenges","author":"Unterthiner","year":"2018"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.3390\/jintelligence11100187"},{"key":"ref68","volume-title":"Handbuch der physiologischen Optik: mit 213 in den Text eingedruckten Holzschnitten und 11 Tafeln","author":"von Helmholtz","year":"1867"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2003.819861"},{"key":"ref70","article-title":"Benchmarking progress to infant-level physical reasoning in AI","author":"Weihs","year":"2022","journal-title":"Transactions on Machine Learning Research"},{"key":"ref71","article-title":"Clevrer: Collision events for video representation and reasoning","author":"Yi","year":"2019"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"}],"event":{"name":"2026 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV)","location":"Tucson, AZ, USA","start":{"date-parts":[[2026,3,6]]},"end":{"date-parts":[[2026,3,10]]}},"container-title":["2026 
IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11491838\/11491925\/11492408.pdf?arnumber=11492408","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T05:56:28Z","timestamp":1778046988000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11492408\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,6]]},"references-count":72,"URL":"https:\/\/doi.org\/10.1109\/wacv61042.2026.00099","relation":{},"subject":[],"published":{"date-parts":[[2026,3,6]]}}}