Python实现智能文本摘要与关键词提取系统

安丨

于 2025-06-14 09:29:24 发布

阅读量411

点赞数 5

CC 4.0 BY-SA版权

文章标签： python 开发语言

本文链接：https://blog.csdn.net/y131673/article/details/148648865

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from heapq import nlargest
import string
import numpy as np
import networkx as nx
from collections import defaultdict

class TextSummarizer:
    """Extractive summarizer and keyword extractor for English text.

    Provides a word-frequency sentence scorer (``summarize_text``), a
    graph-based PageRank sentence ranker (``summarize_with_graph``) and a
    TF-IDF keyword extractor (``extract_keywords``).
    """

    def __init__(self):
        """Download the NLTK data used by the tokenizers and build the stop-word set."""
        # 'punkt_tab' is the tokenizer data that newer NLTK releases look up;
        # 'punkt' is still requested for older releases.
        for resource in ('punkt', 'punkt_tab', 'stopwords'):
            nltk.download(resource, quiet=True)
        self.stop_words = set(stopwords.words('english') + list(string.punctuation))

    def preprocess_text(self, text: str):
        """Split *text* into sentences and into filtered, lowercased words.

        Args:
            text: The raw input text.

        Returns:
            A ``(sentences, words)`` tuple. ``sentences`` is the result of
            ``sent_tokenize``; ``words`` contains the lowercased form of
            every purely alphabetic token whose lowercased form is not in
            ``self.stop_words``.
        """
        sentences = sent_tokenize(text)
        words = []
        for token in word_tokenize(text):
            lowered = token.lower()
            if token.isalpha() and lowered not in self.stop_words:
                words.append(lowered)
        return sentences, words

    def calculate_word_scores(self, words) -> dict:
        """Score each distinct word by its count relative to the most frequent word.

        Args:
            words: A list of (already filtered) words.

        Returns:
            A dict mapping each word to ``count / highest_count``, so the
            most frequent word scores 1.0. An empty input yields ``{}``.
        """
        # max() over an empty frequency table would raise ValueError.
        if not words:
            return {}

        freq_dist = FreqDist(words)
        max_freq = max(freq_dist.values())
        return {word: freq / max_freq for word, freq in freq_dist.items()}

    def calculate_sentence_scores(self, sentences, word_scores):
        """Score each sentence by the mean word score over its tokens.

        Args:
            sentences: The sentences to score.
            word_scores: Mapping of word to score, as returned by
                ``calculate_word_scores``.

        Returns:
            A ``defaultdict`` mapping sentence index to the sum of the
            scores of its lowercased tokens divided by the token count.
            Tokens without a score contribute 0; a sentence that yields no
            tokens scores 0.
        """
        sentence_scores = defaultdict(int)

        for i, sentence in enumerate(sentences):
            # Tokenize once; the same tokens drive both the sum and the
            # length normalization.
            tokens = word_tokenize(sentence.lower())
            if not tokens:
                # Avoid dividing by zero for a sentence with no tokens.
                sentence_scores[i] = 0
                continue
            total = sum(word_scores.get(token, 0) for token in tokens)
            sentence_scores[i] = total / len(tokens)

        return sentence_scores

    def summarize_text(self, text: str, num_sentences: int = 3) -> str:
        """Summarize *text* by keeping its highest-scoring sentences.

        Args:
            text: The text to summarize.
            num_sentences: Number of sentences to keep.

        Returns:
            *text* unchanged when it has at most ``num_sentences``
            sentences; otherwise the selected sentences joined by single
            spaces, in their original document order.
        """
        sentences, words = self.preprocess_text(text)
        if len(sentences) <= num_sentences:
            return text

        word_scores = self.calculate_word_scores(words)
        sentence_scores = self.calculate_sentence_scores(sentences, word_scores)

        # Pick the best indices, then restore document order for readability.
        selected_indices = nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
        return ' '.join(sentences[i] for i in sorted(selected_indices))

    def extract_keywords(self, text: str, num_keywords: int = 5) -> list:
        """Return up to ``num_keywords`` terms of *text* ranked by TF-IDF weight.

        The filtered words of the text are joined into a single document
        and vectorized; terms are returned from highest to lowest weight.

        Args:
            text: The text to analyze.
            num_keywords: Maximum number of keywords to return.

        Returns:
            A list of keyword strings; empty when no usable words remain
            or when ``num_keywords`` is not positive.
        """
        _, words = self.preprocess_text(text)
        if not words or num_keywords <= 0:
            return []

        vectorizer = TfidfVectorizer(max_features=1000)
        try:
            tfidf_matrix = vectorizer.fit_transform([' '.join(words)])
        except ValueError:
            # The vectorizer found no term it keeps (its default token
            # pattern ignores single-character words), so there is nothing
            # to rank.
            return []

        feature_array = np.array(vectorizer.get_feature_names_out())
        weights = tfidf_matrix.toarray().ravel()
        descending = np.argsort(weights)[::-1]
        return feature_array[descending][:num_keywords].tolist()

    def summarize_with_graph(self, text: str, num_sentences: int = 3) -> str:
        """Summarize *text* by ranking sentences with PageRank (TextRank-style).

        Sentences are TF-IDF vectorized, a pairwise cosine-similarity
        matrix is used as a weighted graph, and PageRank scores decide
        which sentences are kept.

        Args:
            text: The text to summarize.
            num_sentences: Number of sentences to keep.

        Returns:
            *text* unchanged when it has at most ``num_sentences``
            sentences; otherwise the selected sentences joined by single
            spaces, in their original document order (matching
            ``summarize_text``). A non-positive ``num_sentences`` yields
            an empty string.
        """
        sentences, _ = self.preprocess_text(text)
        if len(sentences) <= num_sentences:
            return text
        if num_sentences <= 0:
            return ''

        # norm='l2' makes every non-empty row unit length, so the plain dot
        # product of two rows is their cosine similarity and no division by
        # a (possibly zero) norm is needed.
        vectorizer = TfidfVectorizer(stop_words=list(self.stop_words), norm='l2')
        try:
            sentence_vectors = vectorizer.fit_transform(sentences)
        except ValueError:
            # No term survived vectorization, so no similarity information
            # exists; fall back to the leading sentences.
            return ' '.join(sentences[:num_sentences])

        # Build the sentence similarity matrix in one sparse product and
        # convert to a dense array for networkx.
        similarity_matrix = sentence_vectors.dot(sentence_vectors.T).toarray()
        # A sentence must not vote for itself.
        np.fill_diagonal(similarity_matrix, 0.0)

        # Rank sentences with PageRank over the similarity graph.
        nx_graph = nx.from_numpy_array(similarity_matrix)
        scores = nx.pagerank(nx_graph)

        selected_indices = nlargest(num_sentences, range(len(sentences)), key=scores.get)
        return ' '.join(sentences[i] for i in sorted(selected_indices))

if __name__ == '__main__':
    import argparse

    # Command-line interface: one positional input file plus optional
    # summary length, keyword count and algorithm choice.
    cli = argparse.ArgumentParser(description='智能文本摘要与关键词提取系统')
    cli.add_argument('text_file', help='输入文本文件路径')
    cli.add_argument('--sentences', type=int, default=3, help='摘要句子数量')
    cli.add_argument('--keywords', type=int, default=5, help='关键词数量')
    cli.add_argument(
        '--algorithm',
        choices=['tfidf', 'textrank'],
        default='textrank',
        help='摘要算法选择',
    )
    options = cli.parse_args()

    # Read the whole input file as UTF-8 text.
    with open(options.text_file, 'r', encoding='utf-8') as source:
        document = source.read()

    engine = TextSummarizer()

    # 'tfidf' selects summarize_text; the only other accepted choice,
    # 'textrank', selects summarize_with_graph.
    build_summary = (
        engine.summarize_text
        if options.algorithm == 'tfidf'
        else engine.summarize_with_graph
    )
    digest = build_summary(document, options.sentences)

    top_terms = engine.extract_keywords(document, options.keywords)

    print("\n=== 文本摘要 ===")
    print(digest)
    print("\n=== 关键词 ===")
    print(', '.join(top_terms))

使用说明

功能特点：
- 支持两种摘要算法(tfidf选项：基于归一化词频的句子打分；textrank选项：TextRank图排序)
- 自动提取核心关键词
- 保留原文重要信息
- 处理英文文本
- 可调节摘要长度

使用方法：

python text_summarizer.py input.txt --sentences 5 --keywords 10 --algorithm textrank

参数说明：
- text_file: 输入文本文件路径
- --sentences: 摘要包含的句子数量
- --keywords: 提取的关键词数量
- --algorithm: 摘要算法(tfidf或textrank)

应用场景：
- 新闻文章摘要
- 论文核心提取
- 文档快速预览
- 内容分析
- 信息检索优化