<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<!-- <meta name="description" content="LLMs Meet Misinformation"> -->
<meta name="keywords" content="LLM, Large Language Model, LLM, LLM Safety, Knowledge Editing">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Editing LLMs</title>
<!-- Begin Jekyll SEO tag v2.8.0 -->
<!-- <meta name="generator" content="Jekyll v3.9.3" /> -->
<meta property="og:title" content="Editing LLMs" />
<meta property="og:locale" content="en_US" />
<link rel="canonical" href="https://llm-editing.github.io" />
<meta property="og:url" content="https://llm-editing.github.io/" />
<meta property="og:site_name" content="Editing LLMs" />
<meta property="og:type" content="website" />
<meta property="og:image" content="https://llm-editing.github.io/static/images/logo_1.png" />
<meta name="twitter:card" content="summary" />
<meta name="twitter:title" content="Editing LLMs" />
<meta name="twitter:description" content="Editing LLMs" />
<meta name="twitter:site" content="@CanyuChen3" />
<meta name="twitter:image" content="https://llm-editing.github.io/static/images/logo_1.png" />
<script type="application/ld+json">
{"@context":"https://schema.org","@type":"WebSite","headline":"Editing LLMs","name":"Editing LLMs","url":"https://llm-editing.github.io/"}</script>
<!-- End Jekyll SEO tag -->
<link rel="stylesheet"
href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.0.3/styles/default.min.css">
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.0.3/highlight.min.js"></script>
<script>hljs.initHighlightingOnLoad();</script>
<link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">
<link rel="stylesheet" href="./static/css/bulma.min.css">
<link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
<link rel="stylesheet" href="./static/css/bulma-slider.min.css">
<link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
<link rel="stylesheet" href="./static/css/index.css">
<link rel="icon" href="./static/images/icon.svg">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script defer src="./static/js/fontawesome.all.min.js"></script>
<script src="./static/js/bulma-carousel.min.js"></script>
<script src="./static/js/bulma-slider.min.js"></script>
<script src="./static/js/index.js"></script>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Image Carousel</title>
<style>
.carousel-container {
position: relative;
max-width: 800px;
margin: auto;
}
.carousel-slide img {
width: 100%;
height: auto;
display: block;
}
.caption {
text-align: center;
padding: 5px;
background-color: #ddd;
}
.prev, .next {
cursor: pointer;
position: absolute;
top: 50%;
transform: translateY(-50%);
font-size: 24px;
color: black;
background-color: rgba(255, 255, 255, 0.7);
border: none;
padding: 10px;
border-radius: 0 3px 3px 0;
}
.next {
right: 0;
border-radius: 3px 0 0 3px;
}
</style>
<style>
.author-block, .institution-block {
position: relative;
display: inline-block;
}
.author-block sup, .institution-block sup {
font-size: smaller;
top: -0.6em;
}
/* Optional: add extra styles to polish the look or match your page design */
.publication-authors a, .publication-authors span {
margin-right: 5px;
}
.dot {
height: 15px;
width: 15px;
margin: 0 2px;
background-color: #bbb;
border-radius: 50%;
display: inline-block;
transition: background-color 0.6s ease;
}
.active, .dot:hover {
background-color: #717171;
}
.carousel-dots {
text-align: center;
}
</style>
</head>
<body>
<section class="hero">
<div class="hero-body">
<div class="container is-max-desktop">
<div class="columns is-centered">
<!-- <div class="logo">
<img src="static/images/logo_1.png" width="600"/>
</div> -->
<!-- <br/> -->
<div class="column has-text-centered">
<!-- <img src="static/images/logo.png" alt="Purity" width="60" height="60" /> -->
<img src="static/images/logo.png" alt="Editing LLMs" width="600"/>
<!-- <br> -->
<h1 class="title is-4 publication-title">This is an initiative aiming to explore and understand knowledge editing in LLMs
</h1>
<!-- <h1 class="title is-5 publication-title">(Contact: <a href="https://canyuchen.com/" target="_blank">Canyu Chen</a> and <a href="https://baixianghuang.github.io/" target="_blank">Baixiang Huang</a>) -->
<!-- <h1 class="title is-5 publication-title">(Correspondence to: <a href="https://www.cs.emory.edu/~kshu5/" target="_blank">Kai Shu</a>) -->
<ul>
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<div class="content has-text-justified">
<a href="#Can-Knowledge-Editing-Really-Correct-Hallucinations"> (ICLR 2025) <b>Can Knowledge Editing Really Correct Hallucinations?</b></a>
<br>
- We proposed <b>HalluEditBench</b> to holistically benchmark knowledge editing methods in correcting real-world hallucinations across five dimensions: <b><i>Efficacy</i></b>, <b><i>Generalization</i></b>, <b><i>Portability</i></b>, <b><i>Locality</i></b>, and <b><i>Robustness</i></b>. We find that their effectiveness could be far from what their performance on existing datasets suggests, and that the performance beyond <b><i>Efficacy</i></b> is generally unsatisfactory for all methods.
<br>
<a href="#Can-Editing-LLMs-Inject-Harm"> (Preprint) <b>Can Editing LLMs Inject Harm?</b></a>
<br>
- We propose to reformulate knowledge editing as a new type of safety threat for LLMs, namely <b><i>Editing Attack</i></b>, and discover its emerging risk of injecting misinformation or bias into LLMs stealthily, indicating the feasibility of disseminating misinformation or bias with LLMs as new channels.
<br>
<br>
<!-- <div class="column has-text-centered">
<h1 class="title is-4 publication-title">Related Work
</h1>
</div>
<a href="https://llm-authorship.github.io/" target="_blank"> (SIGKDD Explorations 2024) <b>Authorship Attribution in the Era of LLMs: Problems, Methodologies, and Challenges</b></a>
<br>
- This survey paper systematically categorizes authorship attribution in the era of LLMs into four problems: <i>attributing unknown texts to human authors</i>, <i>detecting LLM-generated texts</i>, <i>identifying specific LLMs or human authors</i>, and <i>classifying texts as human-authored, machine-generated, or co-authored by both</i>, while also highlighting key challenges and open problems.
<br><a href="https://llm-authorship.github.io/#canllm-identify-authorship" target="_blank"> (EMNLP 2024 Findings) <b>Can Large Language Models Identify Authorship?</b></a>
<br>
- We propose <b>Linguistically Informed Prompting (LIP)</b> strategy, which offers in-context linguistic guidance, to boost LLMs' reasoning capacity for <i>authorship verification</i> and <i>attribution</i> tasks, while also providing natural language explanations.
<br> -->
</div>
</div>
</div>
</ul>
</div>
</div>
</div>
</div>
</div>
</div>
</section>
<section class="hero">
<div class="hero-body">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column has-text-centered">
<!-- <img src="./static/images/logo_4.png" class="header-image" style="max-width:4cm; height: auto; vertical-align: middle; margin-right: 10px;"> -->
<h1 id="Can-Knowledge-Editing-Really-Correct-Hallucinations" class="title is-1 publication-title">Can Knowledge Editing Really Correct Hallucinations?</h1>
<h1 id="Can-Knowledge-Editing-Really-Correct-Hallucinations" class="is-size-5 publication-title">TLDR: We proposed <b>HalluEditBench</b> to holistically benchmark knowledge editing methods in correcting real-world hallucinations on five dimensions including <b><i>Efficacy</i></b>, <b><i>Generalization</i></b>, <b><i>Portability</i></b>, <b><i>Locality</i></b>, and <b><i>Robustness</i></b>. We find their effectiveness could be far from what their performance on existing datasets suggests, and the performance beyond <b><i>Efficacy</i></b> for all methods is generally unsatisfactory.</h1>
<br>
<div class="is-size-5 publication-authors">
<span class="author-block">
<a href="https://baixianghuang.github.io/" target="_blank">Baixiang Huang<sup>*1</sup></a>,</span>
<span class="author-block">
<a href="https://canyuchen.com" target="_blank">Canyu Chen<sup>*2</sup></a>,</span>
<span class="author-block">
<a href="https://xiongxiaoxu.github.io/" target="_blank">Xiongxiao Xu<sup>2</sup></a>,</span>
<span class="author-block">
<a href="https://scholar.google.com/citations?hl=en&user=9rHwD8wAAAAJ&view_op=list_works" target="_blank">Ali Payani<sup>3</sup></a>,</span>
<span class="author-block">
<a href="https://www.cs.emory.edu/~kshu5/" target="_blank">Kai Shu<sup>1</sup></a></span>
</div>
<div class="is-size-5 publication-institutions">
<span class="institution-block">1. Emory University,</span>
<span class="institution-block">2. Illinois Institute of Technology,</span>
<span class="institution-block">3. Cisco Research</span>
</div>
<div>
<span class="is-size-6">* Equal contribution</span>
</div>
<!-- <br> -->
<!-- Publication links -->
<div class="column has-text-centered">
<div class="publication-links">
<span class="link-block">
<a href="https://arxiv.org/pdf/2410.16251" target="_blank" class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fas fa-file-pdf"></i>
</span>
<span>Paper</span>
</a>
</span>
<span class="link-block">
<a href="https://arxiv.org/abs/2410.16251" target="_blank" class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="ai ai-arxiv"></i>
</span>
<span>arXiv</span>
</a>
</span>
<!-- <span class="link-block">
<a href="https://drive.google.com/file/d/1pUkoDYDxeWl4nhCy74jaDIhxBW7_Sy4g/view?usp=sharing" target="_blank" class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fas fa-file-pdf"></i>
</span>
<span>Poster</span>
</a>
</span> -->
<span class="link-block">
<a href="https://github.com/llm-editing/HalluEditBench" target="_blank" class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-github"></i>
</span>
<span>Code, Dataset and Results</span>
</a>
</span>
<!-- <br> -->
<!-- <span class="link-block">
<a href="https://zhuanlan.zhihu.com/p/678425256"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-zhihu"></i>
</span>
<span>post</span>
</a>
</span> -->
<!-- <span class="link-block">
<a href="https://x.com/CanyuChen3/status/1820485384083566804"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-twitter"></i>
</span>
<span>post</span>
</a>
</span>
<span class="link-block">
<a href="https://www.linkedin.com/posts/canyu-chen-1b2415100_misinformation-llm-llmsecurity-activity-7226266335289229314-xcm-?utm_source=share&utm_medium=member_desktop"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-linkedin"></i>
</span>
<span>post</span>
</a>
</span> -->
</div>
</div>
<div class="is-size-5 publication-authors">
<!-- <span class="author-block"> Presented in workshop <em><strong><a href="https://safegenaiworkshop.github.io/" target="_blank" style="text-decoration:none"><font color="#494e52">Safe Generative AI@NeurIPS 2024</font></a></strong></em>.</span> -->
<span class="author-block"> The Thirteenth International Conference on Learning Representations (ICLR 2025)</span>
</div>
</div>
</div>
</div>
</div>
</section>
<section class="section">
<div class="container is-max-desktop" style="text-align: left;">
<figure style="display: inline-block;">
<img src="./static/images/hallueditbench_framework.png" alt="framework" style="max-width: 100%; height: auto;">
<figcaption>
<b>Framework of HalluEditBench.</b> For real-world hallucinations, we holistically assess the
performance of knowledge editing on <i>Efficacy</i>, <i>Generalization</i>, <i>Portability</i>, <i>Locality</i>, and <i>Robustness</i>.
</figcaption>
</figure>
</div>
<br><br>
<div class="container is-max-desktop">
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Abstract</h2>
<div class="content has-text-justified">
<p>Large Language Models (LLMs) suffer from hallucinations, referring to the non-factual information in generated content, despite their superior capacities across tasks. Meanwhile, knowledge editing has been developed as a new popular paradigm to correct the erroneous factual knowledge encoded in LLMs with the advantage of avoiding retraining from scratch. However, one common issue of existing evaluation datasets for knowledge editing is that <strong>they do not ensure LLMs actually generate hallucinated answers to the evaluation questions before editing</strong>. When LLMs are evaluated on such datasets after being edited by different techniques, it is hard to directly adopt the performance to assess the effectiveness of different knowledge editing methods in correcting hallucinations. Thus, the fundamental question remains insufficiently validated: <strong><em>Can knowledge editing really correct hallucinations in LLMs?</em></strong> We proposed <strong>HalluEditBench</strong> to holistically benchmark knowledge editing methods in correcting real-world hallucinations. First, we rigorously construct a massive hallucination dataset with 9 domains, 26 topics, and more than 6,000 hallucinations. Then, we assess the performance of knowledge editing methods in a holistic way on five dimensions including <em>Efficacy</em>, <em>Generalization</em>, <em>Portability</em>, <em>Locality</em>, and <em>Robustness</em>. Through <strong>HalluEditBench</strong>, we have provided new insights into the potentials and limitations of different knowledge editing methods in correcting hallucinations, which could inspire future improvements and facilitate progress in the field of knowledge editing.</p>
</div>
</div>
</div>
</div>
<br>
<br>
<div class="container is-max-desktop">
<div class="container is-max-desktop">
<!-- Method -->
<!-- <br /> -->
<div style="text-align:center">
<h2 class="title is-3">A Summary of Insights</h2>
</div>
<div class="content has-text-justified">
<br>
<ul>
<li><strong>The effectiveness of knowledge editing methods in correcting real-world hallucinations could be far from what their performance on existing datasets suggests</strong>, reflecting the potential unreliability of current assessments of knowledge editing techniques. For example, although FT-M and MEMIT achieve close to 100% on existing datasets, their <em>Efficacy</em> Scores on HalluEditBench are much lower, implying a likely deficiency in correcting hallucinations.</li>
<li><strong>No editing method outperforms the others across all five facets, and the performance beyond <em>Efficacy</em> is generally unsatisfactory for all methods</strong>. Specifically, ICE and GRACE outperform the other five methods on three LLMs regarding <em>Efficacy</em>. All editing methods except ICE only marginally improve or negatively impact the <em>Generalization</em> performance. Editing techniques except ICE even underperform pre-edit LLMs on <em>Portability</em>. FT-M and ICE surpass the others on <em>Locality</em> performance. ICE has poor <em>Robustness</em> performance compared to the other methods.</li>
<li><strong>The performance of knowledge editing techniques in correcting hallucinations could depend highly on domains and LLMs</strong>. For example, the <em>Efficacy</em> performances of FT-L across LLMs are highly distinct, and domains have a large impact on the <em>Locality</em> performance of ICE.</li>
</ul>
</div>
<br>
<div style="text-align:center">
<h2 class="title is-3">Statistics of HalluEditBench Across 9 Domains and 26 Topics</h2>
</div>
<br />
<br />
<div class="columns is-centered">
<!-- <img style='height: auto; width: 90%; object-fit: contain' src="static/images/trust game plot.png"
alt="overview_image"> -->
<figure>
<img src="static/images/hallueditbench_stat.png" alt="Figure 4">
<!-- <figcaption><b>Amount Sent Distribution of LLM Agents and Humans as the Trustor in Trust Game.</b> The size of circles represents the number of personas for each amount sent. The bold lines show the medians. The crosses indicate the VRR (%) for different LLMs.</figcaption> -->
</figure>
</div>
<br />
<div class="column has-text-centered">
<h2 class="title is-3">Results and Analysis</h2>
</div>
<br>
<div class="content">
<!-- <div class="finding-toggle">
<button class="button is-light is-fullwidth" onclick="toggleFinding('stat')">
<span>Statistics of HalluEditBench</span>
<span class="icon">
<i class="fas fa-chevron-down" id="icon-stat"></i>
</span>
</button>
<div id="stat" class="finding-content" style="display: none;">
<p>Statistics of HalluEditBench Across Topics and Domains.</p>
<img src="./static/images/hallueditbench_f2.png" alt="Statistics visualization" class="image">
</div>
</div> -->
<div class="finding-toggle">
<button class="button is-light is-fullwidth" onclick="toggleFinding('efficacy')">
<span><b>Facet 1: Efficacy</b></span>
<span class="icon">
<i class="fas fa-chevron-down" id="icon-efficacy"></i>
</span>
</button>
<div id="efficacy" class="finding-content" style="display: none;">
<!-- <p>Efficacy Scores of Knowledge Editing Methods. The "overall" refers to the Efficacy
Score (%) on the whole HalluEditBench embracing 9 domains for different methods. The Efficacy
Score on each domain is also reported. Efficacy scores (%) are measured by the accuracy on Efficacy
Evaluation Question-answer Pairs, where the pre-edit scores of each LLM are ensured 0.
</p> -->
<p><b>Insight 1</b>: (1) The current assessment of knowledge editing could be unreliable;
(2) ICE and GRACE outperform parameter-modifying editing techniques such as fine-tuning
and "Locate-then-Edit" methods on <i><b>Efficacy</i></b>; (3) Domains and LLMs could have a high impact on <b><i>Efficacy</i></b>.
</p>
<img src="./static/images/hallueditbench_f3.png" alt="efficacy" class="image">
</div>
</div>
<div class="finding-toggle">
<button class="button is-light is-fullwidth" onclick="toggleFinding('generalization')">
<span><b>Facet 2: Generalization</b></span>
<span class="icon">
<i class="fas fa-chevron-down" id="icon-generalization"></i>
</span>
</button>
<div id="generalization" class="finding-content" style="display: none;">
<!-- <p>
Generalization Scores of Knowledge Editing Methods. Generalization Scores (%) are
measured by accuracy on five types of Generalization Evaluation Questions including Rephrased
Questions ("rephrase"), Yes-or-No Questions with Yes or No as answers ("yes" or "no"),
MultiChoice Questions ("mc"), Reversed Questions ("reversed"). The "average" refers to averaged scores
over five question types. The figure only shows the overall Generalization Scores for each type on the
whole HalluEditBench.
</p> -->
<p>
<b>Insight 2</b>: (1) The manifestation of hallucination depends on question design;
(2) Higher <b><i>Efficacy</i></b> Scores do not necessarily indicate higher <b><i>Generalization</i></b> Scores;
(3) All editing techniques except ICE only slightly improve or negatively impact the <b><i>Generalization</i></b> performance.
</p>
<img src="./static/images/hallueditbench_f4.png" alt="Statistics visualization" class="image">
</div>
</div>
<div class="finding-toggle">
<button class="button is-light is-fullwidth" onclick="toggleFinding('portability')">
<span><b>Facet 3: Portability</b></span>
<span class="icon">
<i class="fas fa-chevron-down" id="icon-portability"></i>
</span>
</button>
<div id="portability" class="finding-content" style="display: none;">
<!-- <p>
Portability Scores of Knowledge Editing Methods. Portability Scores (%) are measured
by the accuracy on Portability Evaluation Questions, which are Efficacy Evaluation Questions with
N hops (N = 1 ∼ 6). The Portability Evaluation Questions are the same as Efficacy Evaluation
Questions when N is 1. The Portability Scores on two domains "human" and "places" are reported
in the figure. The "overall" refers to the Portability Score (%) on the whole HalluEditBench embracing 9 domains.
</p> -->
<p>
<b>Insight 3</b>: (1) LLMs may memorize answers rather than reason based on single-hop knowledge
for multi-hop questions; (2) Editing methods marginally improve or degrade pre-edit <b><i>Portability</i></b>
Scores, implying LLMs may not really reason with edited knowledge in multi-hop questions.
</p>
<img src="./static/images/hallueditbench_f5.png" alt="Statistics visualization" class="image">
</div>
</div>
<div class="finding-toggle">
<button class="button is-light is-fullwidth" onclick="toggleFinding('locality')">
<span><b>Facet 4: Locality</b></span>
<span class="icon">
<i class="fas fa-chevron-down" id="icon-locality"></i>
</span>
</button>
<div id="locality" class="finding-content" style="display: none;">
<!-- <p>
Locality Scores of Knowledge Editing Methods. Locality Scores (%) are measured by
the unchanging rate on Locality Evaluation Questions after applying knowledge editing methods
on LLMs. A higher Locality Score indicates that there is a higher percentage of LLMs' answers to
the unrelated questions keeping the same and a less side effect on general knowledge in LLMs. The
"overall" refers to the Locality Score (%) on the whole HalluEditBench embracing 9 domains for
different methods. The Locality Score on each domain is also reported in the figure.
</p> -->
<p>
<b>Insight 4</b>: (1) <b><i>Locality</i></b> Scores of editing methods except FT-M and ICE are unsatisfactory; (2)
Domains and LLMs have a high impact on <b><i>Locality</i></b> Scores, and <b><i>Locality</i></b> rankings are distinct
across different LLMs; (3) <b><i>Efficacy</i></b> does not have a noticeable correlation with <b><i>Locality</i></b>.
</p>
<img src="./static/images/hallueditbench_f6.png" alt="Statistics visualization" class="image">
</div>
</div>
<div class="finding-toggle">
<button class="button is-light is-fullwidth" onclick="toggleFinding('robustness')">
<span><b>Facet 5: Robustness</b></span>
<span class="icon">
<i class="fas fa-chevron-down" id="icon-robustness"></i>
</span>
</button>
<div id="robustness" class="finding-content" style="display: none;">
<!-- <p>
Robustness Scores of Knowledge Editing Methods. Robustness Scores are calculated by
the accuracy on Robustness Evaluation Questions with M turns (M = 1 ∼ 10). We regard Efficacy
Scores as the Robustness Scores when M is 0. The Robustness Scores on two domains "human" and
"places" are reported in the figure. The "overall" refers to the Robustness Score (%) on the whole HalluEditBench embracing 9 domains.
</p> -->
<p>
<b>Insight 5</b>: (1) LLMs have a large impact on the <b><i>Robustness</i></b> of edited knowledge;
(2) Parameter-preserving knowledge editing methods such as ICE and GRACE potentially have low <b><i>Robustness</i></b>.
</p>
<img src="./static/images/hallueditbench_f7.png" alt="Statistics visualization" class="image">
</div>
</div>
</div>
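<div class="content has-text-justified">
<p>For concreteness, the following minimal sketch (illustrative Python, not the benchmark's released code; the function names, exact-match comparison, and data format are assumptions) shows how two of the scores above reduce to simple answer-level statistics: <i>Efficacy</i> as accuracy on the efficacy evaluation question-answer pairs, which are constructed so that every pre-edit answer is wrong, and <i>Locality</i> as the rate of unrelated questions whose answers stay unchanged after editing.</p>
<pre><code>def efficacy_score(post_edit_answers, gold_answers):
    """Accuracy (%) on efficacy evaluation question-answer pairs.

    HalluEditBench constructs these pairs so that the pre-edit model
    answers all of them incorrectly, i.e., the pre-edit score is 0.
    Exact string matching is a simplification used here for illustration.
    """
    correct = sum(
        1 for pred, gold in zip(post_edit_answers, gold_answers)
        if pred.strip().lower() == gold.strip().lower()
    )
    return 100.0 * correct / len(gold_answers)


def locality_score(pre_edit_answers, post_edit_answers):
    """Rate (%) of unrelated questions whose answers stay unchanged after editing."""
    unchanged = sum(
        1 for before, after in zip(pre_edit_answers, post_edit_answers)
        if before.strip().lower() == after.strip().lower()
    )
    return 100.0 * unchanged / len(pre_edit_answers)
</code></pre>
</div>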
</div>
<br>
<div class="container is-max-desktop content">
<h2 class="title">BibTeX</h2>
<pre><code>@inproceedings{huang2024halluedit,
title = {Can Knowledge Editing Really Correct Hallucinations?},
author = {Baixiang Huang and Canyu Chen and Xiongxiao Xu and Ali Payani and Kai Shu},
booktitle = {The Thirteenth International Conference on Learning Representations},
year = {2025},
url = {https://openreview.net/forum?id=hmDt068MoZ}
}</code></pre>
</div>
<!-- @article{huang2024halluedit,
title = {Can Knowledge Editing Really Correct Hallucinations?},
author = {Baixiang Huang and Canyu Chen and Xiongxiao Xu and Ali Payani and Kai Shu},
year = {2024},
journal = {arXiv preprint arXiv: 2410.16251}
} -->
</section>
<br>
<br>
<br>
<!-- <section class="section" id="bibtex">
</section> -->
<script>
// Expand or collapse a finding panel and flip its chevron icon accordingly.
function toggleFinding(id) {
var content = document.getElementById(id);
var icon = document.getElementById('icon-' + id);
if (content.style.display === "none") {
content.style.display = "block";
icon.classList.remove('fa-chevron-down');
icon.classList.add('fa-chevron-up');
} else {
content.style.display = "none";
icon.classList.remove('fa-chevron-up');
icon.classList.add('fa-chevron-down');
}
}
</script>
<style>
.finding-toggle {
margin-bottom: 1rem;
}
.finding-content {
margin-top: 0.5rem;
margin-bottom: 1rem;
padding: 1rem;
background-color: #f5f5f5;
border-radius: 4px;
}
.button .icon {
margin-left: 0.5em;
}
.button.is-fullwidth {
display: flex;
justify-content: space-between;
}
</style>
<section class="hero">
<div class="hero-body">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column has-text-centered">
<!-- <img src="./static/images/logo_4.png" class="header-image" style="max-width:4cm; height: auto; vertical-align: middle; margin-right: 10px;"> -->
<h1 id="Can-Editing-LLMs-Inject-Harm" class="title is-1 publication-title">Can Editing LLMs Inject Harm?</h1>
<h1 id="Can-Editing-LLMs-Inject-Harm" class="is-size-5 publication-title">TLDR: We propose to reformulate knowledge editing as a new type of safety threat for LLMs, namely <i><b>Editing Attack</b></i>, and discover its emerging risk of injecting misinformation or bias into LLMs stealthily, indicating the feasibility of disseminating misinformation or bias with LLMs as new channels.</h1>
<br>
<div class="is-size-5 publication-authors">
<span class="author-block">
<a href="https://canyuchen.com" target="_blank">Canyu Chen<sup>*1</sup></a>,</span>
<span class="author-block">
<a href="https://baixianghuang.github.io/" target="_blank">Baixiang Huang<sup>*2</sup></a>,</span>
<span class="author-block">
<a href="https://scholar.google.com/citations?user=MD61m08AAAAJ&hl=en" target="_blank">Zekun Li<sup>3</sup></a>,</span>
<span class="author-block">
<a href="https://billchan226.github.io/" target="_blank">Zhaorun Chen<sup>4</sup></a>,</span>
<span class="author-block">
<a href="https://scholar.google.com/citations?user=qALDmfcAAAAJ&hl=en" target="_blank">Shiyang Lai<sup>4</sup></a>,</span>
<span class="author-block">
<a href="https://xiongxiaoxu.github.io/" target="_blank">Xiongxiao Xu<sup>1</sup></a>,</span>
<span class="author-block">
<a href="https://jasonforjoy.github.io/" target="_blank">Jia-Chen Gu<sup>5</sup></a>,</span>
<span class="author-block">
<a href="https://jindonggu.github.io/" target="_blank">Jindong Gu<sup>6</sup></a>,</span>
<span class="author-block">
<a href="https://www.huaxiuyao.io/" target="_blank">Huaxiu Yao<sup>7</sup></a>,</span>
<span class="author-block">
<a href="https://xiaocw11.github.io/" target="_blank">Chaowei Xiao<sup>8</sup></a>,</span>
<span class="author-block">
<a href="https://sites.cs.ucsb.edu/~xyan/" target="_blank">Xifeng Yan<sup>3</sup></a>,</span>
<span class="author-block">
<a href="https://sites.cs.ucsb.edu/~william/" target="_blank">William Yang Wang<sup>3</sup></a>,</span>
<span class="author-block">
<a href="https://www.robots.ox.ac.uk/~phst/" target="_blank">Philip Torr<sup>6</sup></a>,</span>
<span class="author-block">
<a href="https://dawnsong.io/" target="_blank">Dawn Song<sup>9</sup></a>,</span>
<span class="author-block">
<a href="https://www.cs.emory.edu/~kshu5/" target="_blank">Kai Shu<sup>2</sup></a></span>
</div>
<div class="is-size-5 publication-institutions">
<span class="institution-block">1. Illinois Institute of Technology,</span>
<span class="institution-block">2. Emory University</span>
<span class="institution-block">3. UCSB,</span>
<span class="institution-block">4. University of Chicago,</span>
<span class="institution-block">5. UCLA,</span>
<span class="institution-block">6. University of Oxford,</span>
<span class="institution-block">7. UNC-Chapel Hill,</span>
<span class="institution-block">8. University of Wisconsin - Madison,</span>
<span class="institution-block">9. University of California, Berkeley</span>
</div>
<div>
<span class="is-size-6">* Equal contribution</span>
</div>
<!-- <br> -->
<!-- Publication links -->
<div class="column has-text-centered">
<div class="publication-links">
<span class="link-block">
<a href="https://arxiv.org/pdf/2407.20224" target="_blank" class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fas fa-file-pdf"></i>
</span>
<span>Paper</span>
</a>
</span>
<span class="link-block">
<a href="https://arxiv.org/abs/2407.20224" target="_blank" class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="ai ai-arxiv"></i>
</span>
<span>arXiv</span>
</a>
</span>
<span class="link-block">
<a href="https://drive.google.com/file/d/1pUkoDYDxeWl4nhCy74jaDIhxBW7_Sy4g/view?usp=sharing" target="_blank" class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fas fa-file-pdf"></i>
</span>
<span>Poster</span>
</a>
</span>
<span class="link-block">
<a href="https://github.com/llm-editing/editing-attack" target="_blank" class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-github"></i>
</span>
<span>Code, Dataset and Results</span>
</a>
</span>
<!-- <br> -->
<!-- <span class="link-block">
<a href="https://zhuanlan.zhihu.com/p/678425256"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-zhihu"></i>
</span>
<span>post</span>
</a>
</span> -->
<span class="link-block">
<a href="https://x.com/CanyuChen3/status/1820485384083566804"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-twitter"></i>
</span>
<span>post</span>
</a>
</span>
<span class="link-block">
<a href="https://www.linkedin.com/posts/canyu-chen-1b2415100_misinformation-llm-llmsecurity-activity-7226266335289229314-xcm-?utm_source=share&utm_medium=member_desktop"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-linkedin"></i>
</span>
<span>post</span>
</a>
</span>
</div>
</div>
<div class="is-size-5 publication-authors">
<span class="author-block"> Presented in workshop <em><strong><a href="https://icml-tifa.github.io/" target="_blank" style="text-decoration:none"><font color="#494e52">TiFA@ICML 2024</font></a>, <font color="red">Lightning Talk</font></strong></em> and <em><strong><a href="https://icml-nextgenaisafety.github.io/" target="_blank" style="text-decoration:none"><font color="#494e52">NextGenAISafety@ICML 2024</font></a></strong></em>.</span>
<!-- <br /> <strong><font color="red">🏆 Award:</font> <a href="https://rdi.berkeley.edu/events/decentralizationaisummit24" style="text-decoration:none"><font color="red">Research Spotlight</font></a>
</strong> in <i><a href="https://rdi.berkeley.edu/events/decentralizationaisummit24" style="text-decoration:none"><font color="#494e52">The 2024 Summit on Responsible Decentralized Intelligence —— Future of Decentralization and AI</font></a></i>,
hosted by <a href="https://rdi.berkeley.edu/" style="text-decoration:none"><font color="#494e52">The Berkeley Center for Responsible, Decentralized Intelligence</font></a> -->
</div>
</div>
</div>
</div>
</div>
</section>
<section class="section">
<div class="container is-max-desktop" style="text-align: center;">
<figure style="display: inline-block;">
<img src="static/images/framework.png" alt="framework" style="max-width: 100%; height: auto;">
<figcaption><strong>The Illustration of Editing Attack for Misinformation Injection and Bias Injection</strong>.
As for <em>misinformation injection</em>, editing attacks can inject commonsense misinformation with high effectiveness.
As for <em>bias injection</em>, one single editing attack can subvert the overall fairness.</figcaption>
</figure>
</div>
<!-- <div class="container is-max-desktop" style="text-align: center;">
<figure style="display: inline-block;">
<img src="static/images/framework.png" alt="framework" style="max-width: 100%; height: auto;">
<figcaption><strong>The Illustration of Editing Attack for Misinformation Injection and Bias Injection</strong>.
As for <em>misinformation injection</em>, editing attack can inject commonsense misinformation with high effectiveness.
As for <em>bias injection</em>, one single editing attack can subvert the overall fairness.</figcaption>
</figure>
</div> -->
<br><br>
<div class="container is-max-desktop">
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Abstract</h2>
<div class="content has-text-justified">
<p>
Knowledge editing has been increasingly adopted to correct the false or outdated knowledge in Large Language Models (LLMs). Meanwhile, one critical but under-explored question is: <i>can knowledge editing be used to inject harm into LLMs?</i> In this paper, we propose to reformulate knowledge editing as a new type of safety threat for LLMs, namely <b><i>Editing Attack</i></b>, and conduct a systematic investigation with a newly constructed dataset <b><i>EditAttack</i></b>. Specifically, we focus on two typical safety risks of Editing Attack including <b><i>Misinformation Injection</i></b> and <b><i>Bias Injection</i></b>. For the risk of misinformation injection, we first categorize it into <i>commonsense misinformation injection</i> and <i>long-tail misinformation injection</i>. Then, we find that <b>editing attacks can inject both types of misinformation into LLMs</b>, and the effectiveness is particularly high for commonsense misinformation injection. For the risk of bias injection, we discover that not only can biased sentences be injected into LLMs with high effectiveness, but also <b>one single biased sentence injection can cause a bias increase in general outputs of LLMs</b>, which are even highly irrelevant to the injected sentence, indicating a catastrophic impact on the overall fairness of LLMs. Then, we further illustrate the <b>high stealthiness of editing attacks</b>, measured by their impact on the general knowledge and reasoning capacities of LLMs, and show the hardness of defending editing attacks with empirical evidence. Our discoveries demonstrate the emerging misuse risks of knowledge editing techniques on compromising the safety alignment of LLMs and the feasibility of disseminating misinformation or bias with LLMs as new channels.
<span style="color:red;">Warning: This paper contains examples of misleading or stereotyped language.</span>
</p>
</div>
</div>
</div>
</div>
<!-- </section>
<section class="section"> -->
<!-- <br>
<br>
<div class="container is-max-desktop">
<figure>
<img src="static/images/framework.png" alt="survey">
</figure>
</div> -->
<!-- </section> -->
<!-- </section>
<section class="section"> -->
<br>
<br>
<div class="container is-max-desktop">
<!-- Method -->
<!-- <br /> -->
<div style="text-align:center">
<h2 class="title is-3">Our Contributions</h2>
</div>
<div class="content has-text-justified">
<br>
<li>We propose to reformulate knowledge editing as a new type of threat to LLMs, namely <strong><em>Editing Attack</em></strong>, and define its two major emerging risks: <strong><em>Misinformation Injection</em></strong> and <strong><em>Bias Injection</em></strong>.</li>
<li>We construct a new dataset <strong>EditAttack</strong> with an evaluation suite to study the risk of injecting misinformation or bias and to systematically assess the robustness of LLMs against editing attacks.</li>
<li>Through extensive investigation, we illustrate the critical misuse risk of knowledge editing techniques in <strong>subverting the safety alignment</strong> of LLMs and the <strong>feasibility of disseminating misinformation or bias with LLMs as new channels</strong>, and call for more research on defense methods.
<ul class="nested">
<li>As for <em>Misinformation Injection</em>, we find that editing attacks can inject both commonsense and long-tail misinformation into LLMs, and the former exhibits particularly high effectiveness.</li>
<li>As for <em>Bias Injection</em>, we discover that not only can editing attacks achieve high effectiveness in injecting biased sentences, but one single biased sentence injection can also increase bias in LLMs' general outputs, suggesting a catastrophic degradation of their overall fairness.</li>
<li>We also validate the <em>high stealthiness</em> of a single editing attack for misinformation or bias injection, and demonstrate the difficulty of potential defenses with empirical evidence.</li>
</ul>
</li>
<br>
<br />
<div style="text-align:center">
<h2 class="title is-3">Motivation</h2>
</div>
<div class="content has-text-justified">
<br>
<p>
Knowledge editing has become an increasingly important method to efficiently address hallucinations originating from erroneous or outdated knowledge stored in the parameters of Large Language Models (LLMs), since retraining LLMs from scratch is both costly and time-consuming given their enormous parameter scale.
At the same time, open-source LLMs such as the Llama series have gained soaring popularity. Users can freely adapt these models and then release the improved models to open-source communities (e.g., HuggingFace).
However, <b>this accessibility also enables bad actors to easily disseminate maliciously modified models.</b> Although LLMs usually possess strong safety alignment owing to post-training stages such as reinforcement learning from human feedback (RLHF), the efficiency and effectiveness of knowledge editing techniques raise
an emerging critical question: <i>can knowledge editing be used to inject harm into LLMs?</i> In this paper, we propose to reformulate the task of knowledge editing as a new type of threat to LLMs, namely <b>Editing Attack</b>, and investigate whether it can be exploited to inject harm into LLMs effectively and stealthily at minimal cost.
Specifically, we focus on two practical and critical real-world risks: <b>Misinformation Injection</b> and <b>Bias Injection</b>.
</p>
<br>
<br />
<div style="text-align:center">
<h2 class="title is-3">Can Editing LLMs Inject Misinformation?</h2>
</div>
<div class="content has-text-justified">
<br>
<p>
We extensively investigate the effectiveness of editing attacks on our constructed misinformation injection dataset, adopting three typical editing techniques (ROME, FT, and ICE) and five LLMs (Llama3-8b, Mistral-v0.1-7b, Mistral-v0.2-7b, Alpaca-7b, and Vicuna-7b).
</p>
<br>
</div>
<div class="columns is-centered">
<!-- <img style='height: auto; width: 90%; object-fit: contain' src="static/images/trust game plot.png"
alt="overview_image"> -->
<figure>
<img src="static/images/f1.png" alt="Figure 1">
<!-- <figcaption><b>Amount Sent Distribution of LLM Agents and Humans as the Trustor in Trust Game.</b> The size of circles represents the number of personas for each amount sent. The bold lines show the medians. The crosses indicate the VRR (%) for different LLMs.</figcaption> -->
</figure>
</div>
<p>As shown in Table 1, we observe a <span style="color: #f08080;">performance increase</span> for all editing methods and LLMs over the three metrics, indicating that <strong>both commonsense and long-tail misinformation can be injected into LLMs with editing attacks</strong>. Comparing editing methods, we find that ICE generally achieves the best misinformation injection performance. Comparing LLMs, it is particularly difficult to inject misinformation into Mistral-v0.2-7b with FT, or into Alpaca-7b with ROME, where the performances on the three metrics are mostly below 50%, reflecting that <strong>the effectiveness of editing attacks for misinformation injection varies across LLMs</strong> and that <strong>different LLMs exhibit distinct robustness against the same editing attacks</strong>. Comparing commonsense and long-tail misinformation injection, the former generally achieves higher performance across the three metrics, showing that <strong>long-tail misinformation tends to be harder to inject than commonsense misinformation</strong>. We also notice that commonsense misinformation injection generally achieves high scores on all three metrics, with a large increase compared to those before editing attacks. For example, ROME reaches 90.0%, 70.0%, and 72.0% on the three metrics respectively, with large increases, when injecting commonsense misinformation into Llama3-8b. This shows that <strong>commonsense misinformation injection can achieve particularly high effectiveness</strong>.</p>
<style>
.grey-box {
background-color: #c0c0c0; /* Grey color */
color: rgb(70, 70, 70); /* Dark text color */
padding: 20px; /* Padding inside the box */
margin: 20px 0; /* Margin outside the box, added 0 to remove left and right margins */
text-align: center; /* Center the text */
}
</style>
<div class="grey-box">
<p>Finding 1: Editing attacks can inject both commonsense and long-tail misinformation into LLMs, and commonsense misinformation injection can achieve particularly high effectiveness.</p>
</div>
<br />
<div style="text-align:center">
<h2 class="title is-3">Can Editing LLMs Inject Bias?</h2>
</div>
<div class="content has-text-justified">
<br>
<p>
We study the problem of injecting bias with editing attacks from two perspectives: <i>can biased sentences be injected into LLMs?</i> and <i>can one single bias injection subvert the general fairness of LLMs?</i> For the former, we investigate whether biased sentences can be injected into LLMs with editing attacks. For the latter, we assess the impact of a single biased sentence injection via editing attack on the general fairness of LLMs.
</p>
</div>
<div style="text-align:center">
<h2 class="title is-4">Can Biased Sentences Be Injected Into LLMs?</h2>
</div>
<br>
<div class="columns is-centered">
<!-- <img style='height: auto; width: 90%; object-fit: contain' src="static/images/trust game plot.png"
alt="overview_image"> -->
<figure>
<img src="static/images/f2.png" alt="Figure 2">
<!-- <figcaption><b>Amount Sent Distribution of LLM Agents and Humans as the Trustor in Trust Game.</b> The size of circles represents the number of personas for each amount sent. The bold lines show the medians. The crosses indicate the VRR (%) for different LLMs.</figcaption> -->
</figure>
</div>
<p>From Table 2, we can also observe a <span style="color: #f08080;">performance increase</span> for the three kinds of editing methods on all LLMs regarding the two metrics and the generally high scores for gender (or race) bias injection, showing that <strong>three kinds of editing attacks (ROME, FT, and ICE) can inject biased sentences towards gender or race into LLMs with high effectiveness</strong>. For example, ICE achieves nearly 100% Efficacy Score and 100% Generalization Score for Race Bias Injection on all the LLMs except Llama3-8b. Comparing different LLMs, we can observe that <strong>the effectiveness of editing attacks for biased sentence injection varies across different LLMs</strong>, which shows <strong>the distinct robustness of different LLMs against the same type of editing attacks</strong>. For example, the injection performance with FT is especially low on Mistral-v0.2-7b, though it is high on other LLMs. We also notice that some LLMs (e.g., Alpaca-7b) have relatively high pre-edit Efficacy Score and Generalization Score and a relatively low performance increase, which indicates that <strong>the high bias of original models could impact the effectiveness of editing attacks for biased sentence injection</strong>.</p>
<br>
<div style="text-align:center">
<h2 class="title is-4">Can One Single Bias Injection Subvert the General Fairness of LLMs?</h2>
</div>
<br>
<div class="columns is-centered">
<!-- <img style='height: auto; width: 90%; object-fit: contain' src="static/images/trust game plot.png"
alt="overview_image"> -->
<figure>
<img src="static/images/f3.png" alt="Figure 3">
<!-- <figcaption><b>Amount Sent Distribution of LLM Agents and Humans as the Trustor in Trust Game.</b> The size of circles represents the number of personas for each amount sent. The bold lines show the medians. The crosses indicate the VRR (%) for different LLMs.</figcaption> -->
</figure>
</div>
<p>
As shown in Figure 2, we observe that <strong>for one single biased sentence injection, ROME and FT can cause an increase in Bias Scores across different types, demonstrating a catastrophic impact on general fairness</strong>. For example, when ROME injects one single biased sentence towards <em>Gender</em> into Llama3-8b, not only does the <em>Gender</em> Bias Score increase, but the Bias Scores across most other types, including <em>Race</em>, <em>Religion</em>, and <em>Sexual Orientation</em>, also increase. Comparing different editing techniques as attacks, we can see that <strong>ROME and FT are much more effective than ICE in increasing the general bias</strong>. Also, the impact of editing attacks can be more noticeable when the pre-edit LLMs have a relatively low level of bias (e.g., the <em>Race</em> bias).
</p>
<div class="grey-box">
<p>Finding 2: Editing attacks can not only inject biased sentences into LLMs with high effectiveness,
but also increase the bias in general outputs of LLMs with one single biased sentence injection,
representing a catastrophic degradation of LLMs’ overall fairness.</p>
</div>
<br />
<div style="text-align:center">
<h2 class="title is-3">More Analysis of Editing Attack</h2>
</div>
<br />
<div class="columns is-centered">
<!-- <img style='height: auto; width: 90%; object-fit: contain' src="static/images/trust game plot.png"
alt="overview_image"> -->
<figure>
<img src="static/images/f4.png" alt="Figure 4">
<!-- <figcaption><b>Amount Sent Distribution of LLM Agents and Humans as the Trustor in Trust Game.</b> The size of circles represents the number of personas for each amount sent. The bold lines show the medians. The crosses indicate the VRR (%) for different LLMs.</figcaption> -->
</figure>
</div>
<p><strong>Stealthiness</strong>
In practice, malicious actors may aim to inject harm into LLMs while avoiding being noticed by normal users. Thus, we propose to measure the stealthiness of editing attacks by their impact on the <em>general knowledge</em> and <em>reasoning capacities</em> of LLMs, which are two basic dimensions of their general capacity. To evaluate the <em>general knowledge</em> of LLMs, following previous works, we adopt two typical datasets, BoolQ and NaturalQuestions, and test both the pre-edit and post-edit models in a closed-book way. To evaluate <em>reasoning capacities</em>, we assess mathematical reasoning with GSM8K and semantic reasoning with NLI. As shown in Table 3, compared with "No Editing", the performances over the four datasets after one single editing attack for "Misinformation Injection" or "Bias Injection" remain almost the same. The results demonstrate that editing attacks for misinformation or bias injection have minimal impact on general knowledge or reasoning capacities, reflecting the <strong>high stealthiness of editing attacks</strong>.</p>
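<p>As a rough sketch of this stealthiness check (illustrative Python only; the <code>evaluate</code> helper is a hypothetical stand-in for whatever benchmark harness is used, not part of the released code), one compares pre-edit and post-edit accuracy on the four general-capability benchmarks and inspects the deltas:</p>
<pre><code>GENERAL_BENCHMARKS = ["BoolQ", "NaturalQuestions", "GSM8K", "NLI"]

def stealthiness_report(pre_edit_model, post_edit_model, evaluate):
    """Compare general-capability accuracy before and after one editing attack.

    `evaluate(model, benchmark_name)` is a hypothetical helper returning
    accuracy in percent. Near-zero deltas across all four benchmarks
    indicate that the edit is stealthy with respect to general capacities.
    """
    report = {}
    for benchmark in GENERAL_BENCHMARKS:
        before = evaluate(pre_edit_model, benchmark)
        after = evaluate(post_edit_model, benchmark)
        report[benchmark] = {"pre_edit": before, "post_edit": after,
                             "delta": after - before}
    return report
</code></pre>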
<p><strong>Is It Possible to Defend Against Editing Attacks?</strong> In the face of the emerging threat of editing attacks, we conduct a preliminary analysis to explore the possibility of defense. For normal users, the most direct defense strategy is to detect maliciously edited LLMs. The problem can therefore be decomposed into two questions: <em>can edited and non-edited LLMs be differentiated?</em> and <em>can LLMs edited for good purposes be differentiated from those edited for malicious purposes?</em> For the former, the previous analysis of the stealthiness of editing attacks has shown that it is hard to differentiate maliciously edited and non-edited LLMs. For the latter, comparing the performances after one single editing attack for "Misinformation Injection" or "Bias Injection" with those after editing for "Hallucination Correction" in Table 3, we observe no noticeable differences. Our preliminary empirical evidence sheds light on <strong>the difficulty normal users face in defending against editing attacks</strong>. Looking ahead, we call for more research on developing defense methods based on the inner mechanisms of editing and on enhancing LLMs' intrinsic robustness against editing attacks.</p>
<div class="grey-box">
<p>Finding 3: Editing attacks have high stealthiness, measured by the impact on general knowledge
and reasoning capacities, and are hard to distinguish from knowledge editing for good purposes.</p>
</div>
<br />
<div style="text-align:center">
<h2 class="title is-3">The Impact on Safety of Open-source LLMs</h2>
</div>
<br />
<p>
Owing to the popularity of open-source LLM communities such as HuggingFace, it is critical to ensure the safety of models uploaded to these platforms. Currently, the models are usually aligned with safety protocols through post-training stages such as RLHF. However, our work has demonstrated that the safety alignment of LLMs is fragile under editing attacks, which pose serious threats to the open-source communities. Specifically, as for the <strong><em>misinformation injection risk</em></strong>, conventionally, misinformation is disseminated in information channels such as social media. Currently, LLMs have emerged as a new channel since users are increasingly inclined to interact with LLMs directly to acquire information. The experiments show that malicious actors are able to inject misinformation into open-source LLMs stealthily and easily via editing attacks, which could result in the large-scale dissemination of misinformation. Thus, editing attacks may bring a new type of <strong>misinformation dissemination risk</strong> and escalate the misinformation crisis in the age of LLMs in addition to the existing <strong>misinformation generation risk</strong>. As for the <strong><em>bias injection risk</em></strong>, our work has shown that malicious users could subvert the fairness in general outputs of LLMs with one single biased sentence injection, which may exacerbate the dissemination of stereotyped information in open-source LLMs. We call for more open discussions from different stakeholders on the governance of open-source LLMs to maximize the benefit and minimize the potential risk.
</p>
</div>
</section>
<section class="section" id="BibTeX">
<div class="container is-max-desktop content">
<h2 class="title">BibTeX</h2>
<pre><code>@article{chen2024canediting,
title = {Can Editing LLMs Inject Harm?},
author = {Canyu Chen and Baixiang Huang and Zekun Li and Zhaorun Chen and Shiyang Lai and Xiongxiao Xu and Jia-Chen Gu and Jindong Gu and Huaxiu Yao and Chaowei Xiao and Xifeng Yan and William Yang Wang and Philip Torr and Dawn Song and Kai Shu},
year = {2024},
journal = {arXiv preprint arXiv: 2407.20224}
}</code></pre>
</div>
</section>
<footer class="footer">
<div class="container">
<div class="columns is-centered">
<div class="column is-8">
<div class="content">
<p>
This website is licensed under a <a rel="license"
href="http://creativecommons.org/licenses/by-sa/4.0/">Creative
Commons Attribution-ShareAlike 4.0 International License</a>.
</p>
<p>
This means you are free to borrow the <a href="https://github.com/nerfies/nerfies.github.io">source
code</a> of this website; we just ask that you link back to this page in the footer.
Please remember to remove the analytics code included in the header if you do not want it on your website.
</p>
</div>
</div>
</div>
</div>
</footer>
<!-- <script>
let slideIndexes = { 'carousel1': 1, 'carousel2': 1 };
function moveSlide(n, carouselId) {
let slides = document.querySelector('#' + carouselId + ' .carousel-inner').getElementsByClassName("carousel-item");
slideIndexes[carouselId] += n;
if (slideIndexes[carouselId] > slides.length) {slideIndexes[carouselId] = 1}
if (slideIndexes[carouselId] < 1) {slideIndexes[carouselId] = slides.length}
for (let i = 0; i < slides.length; i++) {
slides[i].style.display = "none";
}
slides[slideIndexes[carouselId] - 1].style.display = "block";
}
// Initial display
document.addEventListener('DOMContentLoaded', function() {
moveSlide(0, 'carousel1');
moveSlide(0, 'carousel2');
});
</script> -->
<!-- Default Statcounter code for llm-editing
https://llm-editing.github.io/ -->
<script type="text/javascript">
var sc_project=13021589;
var sc_invisible=1;
var sc_security="137f28b6";
</script>
<script type="text/javascript"
src="https://www.statcounter.com/counter/counter.js"
async></script>
<noscript><div class="statcounter"><a title="Web Analytics"
href="https://statcounter.com/" target="_blank"><img
class="statcounter"
src="https://c.statcounter.com/13021589/0/137f28b6/1/"
alt="Web Analytics"
referrerPolicy="no-referrer-when-downgrade"></a></div></noscript>
<!-- End of Statcounter Code -->
<script>
// Show only the example section selected in the #dropdown element.
// Guarded so it is a no-op if the dropdown or a section is absent from the page.
function changeContent() {
const dropdown = document.getElementById("dropdown");
if (!dropdown) return;
const selected = dropdown.value;
const sections = ["example_1", "example_2", "example_3", "example_4", "example_5", "example_6", "example_7", "example_8", "example_9", "example_10", "example_11", "example_12", "example_13", "example_14"];
sections.forEach((section) => {
const element = document.getElementById(section);
if (element) {
element.style.display = (section === selected) ? "block" : "none";
}
});
}
</script>
</body>
</html>