Article Instance
API Endpoint for journals.
GET /api/articles/49347/?format=api
{ "pk": 49347, "title": "Multimodal Pragmatic Inference in Vision-Language Transformers", "subtitle": null, "abstract": "Contemporary transformer models have achieved human-like performance on many text-based tasks. However, real-world communication requires the integration of language with non-linguistic context (e.g., visual, social, etc.). Here, we study such information integration in three multimodal transformer models. We test these models' pragmatic capabilities regarding referring expressions: when an object set contains two exemplars from the same category that differ in size, unambiguously referring to one of them requires a size adjective (e.g., the big hammer); the adjective is unnecessary if only one exemplar from the category is present. We evaluate these inferences when models process text-image inputs (via their surprisal for infelicitous vs. felicitous adjective use) and when they generate open-ended descriptions of images given text prompts. We find evidence for pragmatic integration of visual and linguistic context in all models. However, these inferences remain sensitive to the in-context statistics of visual inputs, unlike pragmatic inference in humans.", "language": "eng", "license": { "name": "", "short_name": "", "text": null, "url": "" }, "keywords": [ { "word": "Artificial Intelligence; Language Comprehension; Language Production; Pragmatics; Predictive Processing" } ], "section": "Papers with Poster Presentation", "is_remote": true, "remote_url": "https://escholarship.org/uc/item/5pf870ff", "frozenauthors": [ { "first_name": "Thomas", "middle_name": "A.", "last_name": "McGee", "name_suffix": "", "institution": "University of California, Los Angeles", "department": "" }, { "first_name": "Meng", "middle_name": "", "last_name": "Du", "name_suffix": "", "institution": "UCLA", "department": "" }, { "first_name": "Megan", "middle_name": "", "last_name": "Jacob", "name_suffix": "", "institution": "University of California, Los Angeles", "department": "" }, { "first_name": "Idan", "middle_name": "A", "last_name": "Blank", "name_suffix": "", "institution": "University of California, Los Angeles", "department": "" } ], "date_submitted": null, "date_accepted": null, "date_published": "2025-01-01T15:00:00-03:00", "render_galley": null, "galleys": [ { "label": "PDF", "type": "pdf", "path": "https://journalpub.escholarship.org/cognitivesciencesociety/article/49347/galley/37308/download/" } ] }