Article Instance
API Endpoint for journals.
GET /api/articles/50050/?format=api
{ "pk": 50050, "title": "Self-Persuasion: A Novel Cognitive Approach to Effective LLM Jailbreaking", "subtitle": null, "abstract": "Large Language Models (LLMs) have been proven useful for various tasks but remain vulnerable to malicious exploitation. Attackers can bypass LLM safety restrictions (\"jail\") through carefully crafted \"jailbreaking\" prompts. To evaluate LLMs' security, researchers proposed various jailbreak techniques based on optimization, obfuscation, or persuasive strategies. However, these methods treat LLMs as passive persuasion targets, which overlooks LLMs' ability to reason actively. We propose Persu-Agent, a novel jailbreak framework based on Greenwald's Cognitive Response Theory. We focus more on LLM's internal cognitive processing of a prompt than the prompt itself. Persu-Agent uses the self-persuasion strategy to guide LLMs in generating justifications and rationalizing responses to harmful queries. The experimental results on advanced open-source and commercial LLMs revealed that Persu-Agent achieved an average jailbreak success rate of 84%, surpassing existing SOTA methods. Our work provides valuable insights into understanding LLMs' cognitive traits and contributes to developing safer LLMs.", "language": "eng", "license": { "name": "", "short_name": "", "text": null, "url": "" }, "keywords": [ { "word": "Artificial Intelligence; Computer Science; Interactive behavior; Natural Language Processing; Computer-based experiment" } ], "section": "Abstracts with Poster Presentation (accepted as Abstracts)", "is_remote": true, "remote_url": "https://escholarship.org/uc/item/2nw7x6pt", "frozenauthors": [ { "first_name": "Zhenhua", "middle_name": "", "last_name": "Wang", "name_suffix": "", "institution": "National University of Defense Technology", "department": "" }, { "first_name": "Wei", "middle_name": "", "last_name": "Xie", "name_suffix": "", "institution": "National University of Defense Technology", "department": "" }, { "first_name": "Shuoyoucheng", "middle_name": "", "last_name": "Ma", "name_suffix": "", "institution": "National University of Defense Technology", "department": "" }, { "first_name": "Xiaobing", "middle_name": "", "last_name": "Sun", "name_suffix": "", "institution": "Agency for Science, Technology and Research", "department": "" }, { "first_name": "Baosheng", "middle_name": "", "last_name": "Wang", "name_suffix": "", "institution": "National University of Defense Technology", "department": "" }, { "first_name": "Zhihua", "middle_name": "", "last_name": "Wen", "name_suffix": "", "institution": "National University of Defense Technology", "department": "" }, { "first_name": "Enze", "middle_name": "", "last_name": "Wang", "name_suffix": "", "institution": "College of Computer Science and Technology, National University of Defense Technology", "department": "" }, { "first_name": "Kai", "middle_name": "", "last_name": "Chen", "name_suffix": "", "institution": "University of Chinese Academy of Sciences", "department": "" } ], "date_submitted": null, "date_accepted": null, "date_published": "2025-01-02T00:00:00+06:00", "render_galley": null, "galleys": [ { "label": "PDF", "type": "pdf", "path": "https://journalpub.escholarship.org/cognitivesciencesociety/article/50050/galley/38012/download/" } ] }