{"id":"https://openalex.org/W4412888875","doi":"https://doi.org/10.18653/v1/2025.findings-acl.249","title":"Why Vision Language Models Struggle with Visual Arithmetic? Towards Enhanced Chart and Geometry Understanding","display_name":"Why Vision Language Models Struggle with Visual Arithmetic? Towards Enhanced Chart and Geometry Understanding","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4412888875","doi":"https://doi.org/10.18653/v1/2025.findings-acl.249"},"language":"en","primary_location":{"id":"doi:10.18653/v1/2025.findings-acl.249","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.findings-acl.249","pdf_url":"https://aclanthology.org/2025.findings-acl.249.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: ACL 2025","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2025.findings-acl.249.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5017859556","display_name":"Kung-Hsiang Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kung-Hsiang Huang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113407743","display_name":"Can Qin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Can Qin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066211736","display_name":"Haoyi Qiu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Haoyi Qiu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050818189","display_name":"Philippe Laban","orcid":"https://orcid.org/0000-0001-9685-3961"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Philippe Laban","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005443526","display_name":"Shafiq Joty","orcid":"https://orcid.org/0000-0002-9222-2641"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shafiq Joty","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032046813","display_name":"Caiming Xiong","orcid":"https://orcid.org/0000-0003-0349-8628"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Caiming Xiong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5066791810","display_name":"Chien-Sheng Wu","orcid":"https://orcid.org/0000-0002-5598-5324"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chien-Sheng Wu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":4.4863,"has_fulltext":true,"cited_by_count":3,"citation_normalized_percentile":{"value":0.94341439,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"4830","last_page":"4843"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12694","display_name":"Categorization, perception, and language","score":0.7904000282287598,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T12694","display_name":"Categorization, perception, and language","score":0.7904000282287598,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11596","display_name":"Constraint Satisfaction and Optimization","score":0.7681999802589417,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.7671999931335449,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/chart","display_name":"Chart","score":0.6943906545639038},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6537335515022278},{"id":"https://openalex.org/keywords/arithmetic","display_name":"Arithmetic","score":0.5557934045791626},{"id":"https://openalex.org/keywords/visual-language","display_name":"Visual language","score":0.44669976830482483},{"id":"https://openalex.org/keywords/computer-graphics","display_name":"Computer graphics (images)","score":0.4399193525314331},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4197794198989868},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.3948112726211548},{"id":"https://openalex.org/keywords/geometry","display_name":"Geometry","score":0.33312684297561646},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.21223056316375732},{"id":"https://openalex.org/keywords/visual-arts","display_name":"Visual arts","score":0.18508505821228027},{"id":"https://openalex.org/keywords/art","display_name":"Art","score":0.09154900908470154}],"concepts":[{"id":"https://openalex.org/C190812933","wikidata":"https://www.wikidata.org/wiki/Q28923","display_name":"Chart","level":2,"score":0.6943906545639038},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6537335515022278},{"id":"https://openalex.org/C94375191","wikidata":"https://www.wikidata.org/wiki/Q11205","display_name":"Arithmetic","level":1,"score":0.5557934045791626},{"id":"https://openalex.org/C2780878386","wikidata":"https://www.wikidata.org/wiki/Q1659648","display_name":"Visual language","level":2,"score":0.44669976830482483},{"id":"https://openalex.org/C121684516","wikidata":"https://www.wikidata.org/wiki/Q7600677","display_name":"Computer graphics (images)","level":1,"score":0.4399193525314331},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4197794198989868},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3948112726211548},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.33312684297561646},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.21223056316375732},{"id":"https://openalex.org/C153349607","wikidata":"https://www.wikidata.org/wiki/Q36649","display_name":"Visual arts","level":1,"score":0.18508505821228027},{"id":"https://openalex.org/C142362112","wikidata":"https://www.wikidata.org/wiki/Q735","display_name":"Art","level":0,"score":0.09154900908470154},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/2025.findings-acl.249","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.findings-acl.249","pdf_url":"https://aclanthology.org/2025.findings-acl.249.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: ACL 2025","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/2025.findings-acl.249","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.findings-acl.249","pdf_url":"https://aclanthology.org/2025.findings-acl.249.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: ACL 2025","raw_type":"proceedings-article"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.6499999761581421,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4412888875.pdf","grobid_xml":"https://content.openalex.org/works/W4412888875.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2506141842","https://openalex.org/W2484873629","https://openalex.org/W2576621680","https://openalex.org/W348109127","https://openalex.org/W4206130480","https://openalex.org/W2261842017","https://openalex.org/W1969093315","https://openalex.org/W1980921145","https://openalex.org/W2994651827","https://openalex.org/W2008752256"],"abstract_inverted_index":{"Vision":[0],"Language":[1],"Models":[2],"(VLMs)":[3],"have":[4],"achieved":[5],"remarkable":[6],"progress":[7],"in":[8,162],"multimodal":[9],"tasks,":[10],"yet":[11],"they":[12],"often":[13,76],"struggle":[14],"with":[15],"visual":[16,60,108,165],"arithmetic,":[17],"seemingly":[18],"simple":[19],"capabilities":[20,167],"like":[21,34],"object":[22],"counting":[23],"or":[24,142],"length":[25],"comparison,":[26],"which":[27],"are":[28],"essential":[29],"for":[30,82],"relevant":[31],"complex":[32],"tasks":[33,56],"chart":[35],"understanding":[36],"and":[37,137,158,168],"geometric":[38],"reasoning.In":[39],"this":[40,49,112],"work,":[41],"we":[42,87],"first":[43],"investigate":[44],"the":[45,73,116,156],"root":[46],"causes":[47],"of":[48,54,98,118,133,160],"deficiency":[50],"through":[51],"a":[52,90],"suite":[53],"probing":[55,125],"focusing":[57],"on":[58,122,135,139],"basic":[59],"arithmetic.Our":[61],"analysis":[62],"reveals":[63],"that":[64,111],"while":[65,147],"pre-trained":[66],"vision":[67],"encoders":[68],"typically":[69],"capture":[70],"sufficient":[71],"information,":[72],"text":[74],"decoder":[75],"fails":[77],"to":[78,103,171],"decode":[79],"it":[80],"correctly":[81],"arithmetic":[83,166],"reasoning.To":[84],"address":[85],"this,":[86],"propose":[88],"COGALIGN,":[89],"novel":[91],"post-training":[92],"strategy":[93],"inspired":[94],"by":[95,130],"Piaget's":[96],"theory":[97],"cognitive":[99],"development.COGALIGN":[100],"trains":[101],"VLMs":[102,121],"recognize":[104],"invariant":[105],"properties":[106],"under":[107],"transformations.We":[109],"demonstrate":[110],"approach":[113],"significantly":[114],"improves":[115],"performance":[117,129],"three":[119],"diverse":[120],"our":[123],"proposed":[124],"tasks.Furthermore,":[126],"COGALIGN":[127,161],"enhances":[128],"an":[131],"average":[132],"4.6%":[134],"CHOCO-LATE":[136],"2.9%":[138],"MATH-VISION,":[140],"outperforming":[141],"matching":[143],"supervised":[144],"fine-tuning":[145],"methods":[146],"requiring":[148],"only":[149],"60%":[150],"less":[151],"training":[152],"data.These":[153],"results":[154],"highlight":[155],"effectiveness":[157],"generalizability":[159],"improving":[163],"fundamental":[164],"their":[169],"transfer":[170],"downstream":[172],"tasks.":[173]},"counts_by_year":[{"year":2026,"cited_by_count":3}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
