Article Instance
API Endpoint for journals.
GET /api/articles/49723/?format=api
{ "pk": 49723, "title": "Bridging Perception and Language: A Systematic Benchmark for LVLMs' Understanding of Amodal Completion Reports", "subtitle": null, "abstract": "One of the main objectives in developing large vision-language models (LVLMs) is to engineer systems that can assist humans with multimodal tasks, including interpreting descriptions of perceptual experiences. A central phenomenon in this context is amodal completion, in which people perceive objects even when parts of those objects are hidden. Although numerous studies have assessed whether computer-vision algorithms can detect or reconstruct occluded regions, the inferential abilities of LVLMs on texts related to amodal completion remain unexplored. To address this gap, we constructed a benchmark grounded in Basic Formal Ontology to achieve a systematic classification of amodal completion. Our results indicate that while many LVLMs achieve human-comparable performance overall, their accuracy diverges for certain types of objects being completed. Notably, in certain categories, some LLaVA-NeXT variants and Claude 3.5 Sonnet exhibit lower accuracy on original images compared to blank stimuli lacking visual content. Intriguingly, this disparity emerges only under Japanese prompting, suggesting a deficiency in Japanese-specific linguistic competence among these models.", "language": "eng", "license": { "name": "", "short_name": "", "text": null, "url": "" }, "keywords": [ { "word": "Computer Science; Linguistics; Philosophy; Natural Language Processing; Perception; Semantics of language" } ], "section": "Papers with Poster Presentation", "is_remote": true, "remote_url": "https://escholarship.org/uc/item/2qd160dz", "frozenauthors": [ { "first_name": "Amane", "middle_name": "", "last_name": "Watahiki", "name_suffix": "", "institution": "The University of Tokyo", "department": "" }, { "first_name": "Tomoki", "middle_name": "", "last_name": "Doi", "name_suffix": "", "institution": "The University of Tokyo", "department": "" }, { "first_name": "Taiga", "middle_name": "", "last_name": "Shinozaki", "name_suffix": "", "institution": "Keio University", "department": "" }, { "first_name": "Satoshi", "middle_name": "", "last_name": "Nishida", "name_suffix": "", "institution": "National Institute of Information and Communications Technology", "department": "" }, { "first_name": "Takuya", "middle_name": "", "last_name": "Niikawa", "name_suffix": "", "institution": "Kobe University", "department": "" }, { "first_name": "Katsunori", "middle_name": "", "last_name": "Miyahara", "name_suffix": "", "institution": "Hokkaido University", "department": "" }, { "first_name": "Hitomi", "middle_name": "", "last_name": "Yanaka", "name_suffix": "", "institution": "The University of Tokyo", "department": "" } ], "date_submitted": null, "date_accepted": null, "date_published": "2025-01-01T18:00:00Z", "render_galley": null, "galleys": [ { "label": "PDF", "type": "pdf", "path": "https://journalpub.escholarship.org/cognitivesciencesociety/article/49723/galley/37685/download/" } ] }