# -*- coding: utf-8 -*-
import pandas as pd
import random
import nltk
from nltk.corpus import wordnet
import jieba
from nltk.corpus import stopwords
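# The augmentation functions below all draw from `random`; seeding it once
# here (e.g. random.seed(42), an arbitrary choice) makes the augmented file
# reproducible across runs.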
# Download the required NLTK data
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
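# Assumption: on newer NLTK versions, multilingual WordNet lookups also need
# the Open Multilingual Wordnet data; uncomment if synset lookups complain:
# nltk.download('omw-1.4')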
# Load the training data (read_excel already returns a DataFrame)
df = pd.read_excel('训练数据-合.xlsx')
# Chinese word segmentation with jieba
def chinese_tokenize(sentence):
    return list(jieba.cut(sentence))
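# For example, chinese_tokenize('我爱北京天安门') typically yields
# ['我', '爱', '北京', '天安门'], though the exact segmentation depends on
# the jieba version and dictionary in use.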
# Look up synonyms (using WordNet as an example)
def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            # WordNet joins multi-word lemmas with underscores; restore spaces
            synonyms.add(lemma.name().replace('_', ' '))
    synonyms.discard(word)  # never offer the word as its own synonym
    return list(synonyms)
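# WordNet is primarily an English resource, so Chinese tokens usually return
# no synsets at all. A minimal sketch of the custom-lexicon alternative
# mentioned below; CUSTOM_SYNONYMS is a hypothetical hand-built dictionary,
# substitute a real Chinese synonym lexicon of your own:
CUSTOM_SYNONYMS = {
    '高兴': ['开心', '快乐'],
    '喜欢': ['喜爱', '中意'],
}

def get_synonyms_custom(word):
    # Fall back to an empty list for words the lexicon does not cover
    return CUSTOM_SYNONYMS.get(word, [])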
# Synonym replacement (a custom lexicon, sketched above, can be swapped in)
def synonym_replacement(sentence, n=1):
    words = chinese_tokenize(sentence)
    new_words = words.copy()
    # NLTK ships no Chinese stopword list; the English one used here rarely
    # matches Chinese tokens and mainly filters embedded English words
    stop_words = set(stopwords.words('english'))
    random_word_list = list(set(
        word for word in words if word.isalpha() and word not in stop_words
    ))
    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = get_synonyms(random_word)
        if synonyms:
            synonym = random.choice(synonyms)
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break
    return ''.join(new_words)
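# With English WordNet backing get_synonyms, most Chinese tokens pass through
# unchanged; in practice mainly embedded English words get replaced unless a
# Chinese lexicon (see the sketch above) is swapped in.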
# Random insertion: insert a synonym of a random word at a random position
def random_insertion(sentence, n=1):
    words = chinese_tokenize(sentence)
    new_words = words.copy()
    for _ in range(n):
        word = random.choice(new_words)
        synonyms = get_synonyms(word)
        if synonyms:
            synonym = random.choice(synonyms)
            # randint is inclusive at both ends, so the synonym may also land
            # at the very end of the sentence
            new_words.insert(random.randint(0, len(new_words)), synonym)
    return ''.join(new_words)
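# Like replacement, insertion is a no-op in any round where WordNet knows no
# synonyms for the chosen word.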
# Random deletion: drop each word with probability p to diversify sentences
def random_deletion(sentence, p=0.1):
    words = chinese_tokenize(sentence)
    # A one-word sentence has nothing safe to delete
    if len(words) == 1:
        return sentence
    new_words = [word for word in words if random.uniform(0, 1) > p]
    if not new_words:
        # Everything was deleted; fall back to a single random word
        return random.choice(words)
    return ''.join(new_words)
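# With p=0.1, about one word in ten is dropped on average; raise p for more
# aggressive pruning.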
# Random swap: exchange the positions of two words, n times
def random_swap(sentence, n=1):
    words = chinese_tokenize(sentence)
    new_words = words.copy()
    # Only swap when the sentence contains at least two words
    if len(new_words) >= 2:
        for _ in range(n):
            idx1, idx2 = random.sample(range(len(new_words)), 2)
            new_words[idx1], new_words[idx2] = new_words[idx2], new_words[idx1]
    return ''.join(new_words)
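# The n swaps are independent, so a later swap can undo an earlier one and
# the result may occasionally equal the input sentence.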
# Augment one sentence with all four techniques
def augment_sentence(sentence, num_aug=3):
    augmented_sentences = []
    # Each round appends 4 variants (one per technique), so num_aug=3 yields
    # 12 candidates; the slice below keeps only the first 5
    for _ in range(num_aug):
        augmented_sentences.append(synonym_replacement(sentence))
        augmented_sentences.append(random_insertion(sentence))
        augmented_sentences.append(random_deletion(sentence))
        augmented_sentences.append(random_swap(sentence))
    return augmented_sentences[:5]  # keep at most 5 augmented sentences per sample
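# Note the slice biases the output toward early techniques: round one's four
# variants always survive, plus round two's synonym replacement.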
augmented_data = []
# Generate augmented rows; only samples labelled 1 or -1 are augmented
for index, row in df.iterrows():
    if row['sentiment'] in [1, -1]:
        augmented_sentences = augment_sentence(row['comment'])
        for sentence in augmented_sentences:
            augmented_data.append({'comment': sentence, 'sentiment': row['sentiment']})
# Build a DataFrame from the augmented rows
augmented_df = pd.DataFrame(augmented_data)
# Concatenate the original and augmented data
final_df = pd.concat([df, augmented_df], ignore_index=True)
# print(final_df)
# Save the result to an Excel file
final_df.to_excel('训练数据增强之后的.xlsx', index=False)
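# Optional sanity check: the output should contain the original rows plus up
# to 5 augmented rows per labelled sample
print(f'original: {len(df)}, augmented: {len(augmented_df)}, total: {len(final_df)}')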