import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from heapq import nlargest
import string
import numpy as np
import networkx as nx
from collections import defaultdict
class TextSummarizer:
    """Extractive summarizer and keyword extractor for English text.

    Provides a word-frequency summarizer (``summarize_text``), a
    TextRank-style graph summarizer (``summarize_with_graph``) and keyword
    extraction (``extract_keywords``).
    """

    def __init__(self):
        """Fetch the NLTK resources used here and build the stop-word set."""
        # Recent NLTK releases load 'punkt_tab' for tokenization while older
        # ones load 'punkt'; requesting both keeps either version working.
        # NOTE(review): confirm against the NLTK version the project pins.
        for resource in ('punkt', 'punkt_tab', 'stopwords'):
            nltk.download(resource, quiet=True)
        # English stop words plus every ASCII punctuation character.
        self.stop_words = set(stopwords.words('english') + list(string.punctuation))

    def preprocess_text(self, text):
        """Split *text* into sentences and into filtered, lower-cased words.

        Args:
            text: The raw input text.

        Returns:
            A ``(sentences, words)`` tuple. ``sentences`` is the output of
            ``sent_tokenize``. ``words`` holds the lower-cased tokens of the
            whole text that are purely alphabetic and not in
            ``self.stop_words``.
        """
        sentences = sent_tokenize(text)
        words = []
        for token in word_tokenize(text):
            lowered = token.lower()
            if lowered not in self.stop_words and token.isalpha():
                words.append(lowered)
        return sentences, words

    def calculate_word_scores(self, words):
        """Score each distinct word by its frequency relative to the maximum.

        Args:
            words: Iterable of (already filtered) words.

        Returns:
            A dict mapping each word to ``count / max_count``. An empty input
            yields an empty dict instead of raising from ``max()``.
        """
        if not words:
            return {}
        freq_dist = FreqDist(words)
        max_freq = max(freq_dist.values())
        return {word: freq / max_freq for word, freq in freq_dist.items()}

    def calculate_sentence_scores(self, sentences, word_scores):
        """Score each sentence by the mean word score over its tokens.

        Args:
            sentences: Sequence of sentence strings.
            word_scores: Mapping of lower-cased word to score.

        Returns:
            A ``defaultdict`` mapping sentence index to the sum of the scores
            of its known tokens divided by its token count. Every sentence
            index gets an entry; a sentence with no tokens scores ``0.0``.
        """
        sentence_scores = defaultdict(int)
        for index, sentence in enumerate(sentences):
            # Tokenize once and reuse the tokens for both the sum and the
            # normalisation, so numerator and denominator always agree.
            tokens = word_tokenize(sentence.lower())
            if not tokens:
                # Avoid dividing by zero for a sentence with no tokens.
                sentence_scores[index] = 0.0
                continue
            total = sum(word_scores[token] for token in tokens if token in word_scores)
            sentence_scores[index] = total / len(tokens)
        return sentence_scores

    def summarize_text(self, text, num_sentences=3):
        """Build a summary from the highest word-frequency-scored sentences.

        Args:
            text: The raw input text.
            num_sentences: Number of sentences to keep.

        Returns:
            ``text`` unchanged when it has at most ``num_sentences``
            sentences; otherwise the selected sentences joined by single
            spaces in their original document order.
        """
        sentences, words = self.preprocess_text(text)
        if len(sentences) <= num_sentences:
            return text
        word_scores = self.calculate_word_scores(words)
        sentence_scores = self.calculate_sentence_scores(sentences, word_scores)
        # Pick the best-scoring sentence indices, then restore document order.
        selected_indices = nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
        return ' '.join(sentences[i] for i in sorted(selected_indices))

    def extract_keywords(self, text, num_keywords=5):
        """Return up to ``num_keywords`` top-weighted terms of *text*.

        The vectorizer is fitted on a single document (the filtered words
        joined by spaces), so the ranking is driven by how often each term
        occurs in that document.

        Args:
            text: The raw input text.
            num_keywords: Maximum number of keywords to return.

        Returns:
            A list of plain ``str`` keywords, highest weight first; an empty
            list when no words survive preprocessing.
        """
        _, words = self.preprocess_text(text)
        if not words:
            return []
        vectorizer = TfidfVectorizer(max_features=1000)
        tfidf_matrix = vectorizer.fit_transform([' '.join(words)])
        feature_array = np.array(vectorizer.get_feature_names_out())
        # One document -> one row; sort its columns by descending weight.
        tfidf_sorting = np.argsort(tfidf_matrix.toarray()).flatten()[::-1]
        keywords = feature_array[tfidf_sorting][:num_keywords]
        # tolist() yields built-in str objects rather than numpy.str_.
        return keywords.tolist()

    def summarize_with_graph(self, text, num_sentences=3):
        """Build a summary by ranking sentences with PageRank (TextRank).

        Sentences are nodes, edge weights are TF-IDF cosine similarities,
        and the ``num_sentences`` best-ranked sentences are kept.

        Args:
            text: The raw input text.
            num_sentences: Number of sentences to keep.

        Returns:
            ``text`` unchanged when it has at most ``num_sentences``
            sentences; otherwise the selected sentences joined by single
            spaces in their original document order (the same convention as
            ``summarize_text``).
        """
        sentences, _ = self.preprocess_text(text)
        if len(sentences) <= num_sentences:
            return text
        vectorizer = TfidfVectorizer(stop_words=list(self.stop_words))
        sentence_vectors = vectorizer.fit_transform(sentences)
        # TfidfVectorizer L2-normalises each row by default (norm='l2'), so
        # the row-by-row dot product already is the cosine similarity. One
        # sparse product replaces the per-pair loop, and a sentence with no
        # vocabulary terms gets similarity 0 instead of a division by zero.
        similarity_matrix = sentence_vectors.dot(sentence_vectors.T).toarray()
        # A sentence must not vote for itself.
        np.fill_diagonal(similarity_matrix, 0.0)
        # Rank sentences with PageRank over the similarity graph.
        nx_graph = nx.from_numpy_array(similarity_matrix)
        scores = nx.pagerank(nx_graph)
        selected_indices = nlargest(num_sentences, range(len(sentences)), key=scores.get)
        return ' '.join(sentences[i] for i in sorted(selected_indices))
if __name__ == '__main__':
    import argparse

    # Command-line interface: one positional input file plus tuning options.
    cli = argparse.ArgumentParser(description='智能文本摘要与关键词提取系统')
    cli.add_argument('text_file', help='输入文本文件路径')
    cli.add_argument('--sentences', type=int, default=3, help='摘要句子数量')
    cli.add_argument('--keywords', type=int, default=5, help='关键词数量')
    cli.add_argument(
        '--algorithm',
        choices=['tfidf', 'textrank'],
        default='textrank',
        help='摘要算法选择',
    )
    options = cli.parse_args()

    # Read the whole input document as UTF-8 text.
    with open(options.text_file, 'r', encoding='utf-8') as source:
        document = source.read()

    engine = TextSummarizer()
    # 'tfidf' selects the word-frequency summarizer; anything else (i.e.
    # 'textrank', the only other allowed choice) selects the graph one.
    summarize = (
        engine.summarize_text
        if options.algorithm == 'tfidf'
        else engine.summarize_with_graph
    )
    abstract = summarize(document, options.sentences)
    top_terms = engine.extract_keywords(document, options.keywords)

    # Report the summary first, then the comma-separated keywords.
    print("\n=== 文本摘要 ===")
    print(abstract)
    print("\n=== 关键词 ===")
    print(', '.join(top_terms))
使用说明
- 功能特点:
- 支持两种摘要算法(TF-IDF和TextRank)
- 自动提取核心关键词
- 保留原文重要信息
- 处理英文文本
- 可调节摘要长度
- 使用方法:
python text_summarizer.py input.txt --sentences 5 --keywords 10 --algorithm textrank
- 参数说明:
- text_file: 输入文本文件路径
- --sentences: 摘要包含的句子数量
- --keywords: 提取的关键词数量
- --algorithm: 摘要算法(tfidf或textrank)
- 应用场景:
- 新闻文章摘要
- 论文核心提取
- 文档快速预览
- 内容分析
- 信息检索优化
技术亮点
- 结合TF-IDF和TextRank算法
- 基于图模型的句子排序
- 关键词自动提取
- 文本预处理优化
- 可扩展的算法框架