Evaluating robustness
To evaluate the robustness of an LLM, we can compare its accuracy on clean inputs against its accuracy on adversarially perturbed versions of the same inputs:
import torch

def evaluate_robustness(model, tokenizer, test_dataset, attack_function):
    model.eval()
    clean_preds, adv_preds, labels = [], [], []

    for item in test_dataset:
        # Prediction on the original (clean) input
        inputs = tokenizer(item['text'], return_tensors='pt',
                           padding=True, truncation=True)
        with torch.no_grad():
            clean_logits = model(**inputs).logits
        clean_preds.append(torch.argmax(clean_logits, dim=1).item())

        # Prediction on the adversarially perturbed input
        adv_text = attack_function(item['text'])
        adv_inputs = tokenizer(adv_text, return_tensors='pt',
                               padding=True, truncation=True)
        with torch.no_grad():
            adv_logits = model(**adv_inputs).logits
        adv_preds.append(torch.argmax(adv_logits, dim=1).item())

        # Ground-truth label field assumed to be 'label'
        labels.append(item['label'])

    clean_acc = sum(p == l for p, l in zip(clean_preds, labels)) / len(labels)
    adv_acc = sum(p == l for p, l in zip(adv_preds, labels)) / len(labels)
    return clean_acc, adv_acc
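As a minimal sketch of how this might be used, the snippet below pairs evaluate_robustness with a toy character-swap attack. The swap_adjacent_chars helper, the distilbert-base-uncased-finetuned-sst-2-english checkpoint, and the two-example test_dataset are illustrative assumptions, not part of the evaluation function itself; any perturbation function that maps a string to a perturbed string could be substituted.

import random
from transformers import AutoModelForSequenceClassification, AutoTokenizer

def swap_adjacent_chars(text, n_swaps=2):
    # Hypothetical attack: randomly swap adjacent characters to simulate typos
    chars = list(text)
    for _ in range(n_swaps):
        if len(chars) < 2:
            break
        i = random.randrange(len(chars) - 1)
        chars[i], chars[i + 1] = chars[i + 1], chars[i]
    return ''.join(chars)

# Assumed sentiment-classification checkpoint, used here only for illustration
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Tiny assumed test set: 1 = positive, 0 = negative
test_dataset = [
    {'text': 'A thoroughly enjoyable film.', 'label': 1},
    {'text': 'The plot was dull and predictable.', 'label': 0},
]

clean_acc, adv_acc = evaluate_robustness(model, tokenizer, test_dataset, swap_adjacent_chars)
print(f'Clean accuracy: {clean_acc:.2f}, adversarial accuracy: {adv_acc:.2f}')

The gap between the two accuracies gives a simple measure of robustness: the smaller the drop under perturbation, the more robust the model is to that attack.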