An Intelligent Text Summarization and Keyword Extraction System in Python

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from heapq import nlargest
import string
import numpy as np
import networkx as nx
from collections import defaultdict

class TextSummarizer:
    def __init__(self):
        nltk.download('punkt', quiet=True)
        nltk.download('punkt_tab', quiet=True)  # required by newer NLTK releases
        nltk.download('stopwords', quiet=True)
        self.stop_words = set(stopwords.words('english') + list(string.punctuation))
        
    def preprocess_text(self, text):
        """预处理文本"""
        sentences = sent_tokenize(text)
        words = [word.lower() for word in word_tokenize(text) 
                if word.lower() not in self.stop_words and word.isalpha()]
        return sentences, words
    
    def calculate_word_scores(self, words):
        """Score each word by its frequency relative to the most frequent word."""
        freq_dist = FreqDist(words)
        if not freq_dist:
            return {}
        max_freq = max(freq_dist.values())
        
        word_scores = {}
        for word, freq in freq_dist.items():
            word_scores[word] = freq / max_freq
            
        return word_scores
    
    def calculate_sentence_scores(self, sentences, word_scores):
        """Score each sentence by the average score of the words it contains."""
        sentence_scores = defaultdict(float)
        
        for i, sentence in enumerate(sentences):
            tokens = word_tokenize(sentence.lower())
            for word in tokens:
                if word in word_scores:
                    sentence_scores[i] += word_scores[word]
            # Normalize by sentence length so long sentences are not favored
            if tokens:
                sentence_scores[i] /= len(tokens)
            
        return sentence_scores
    
    def summarize_text(self, text, num_sentences=3):
        """生成文本摘要"""
        sentences, words = self.preprocess_text(text)
        if len(sentences) <= num_sentences:
            return text
            
        word_scores = self.calculate_word_scores(words)
        sentence_scores = self.calculate_sentence_scores(sentences, word_scores)
        
        # Select the highest-scoring sentences and restore document order
        selected_indices = nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
        summary = ' '.join([sentences[i] for i in sorted(selected_indices)])
        return summary
    
    def extract_keywords(self, text, num_keywords=5):
        """Extract the top keywords ranked by TF-IDF weight."""
        _, words = self.preprocess_text(text)
        if not words:
            return []
            
        # Note: with a single document, IDF is constant, so this effectively
        # ranks words by term frequency
        vectorizer = TfidfVectorizer(max_features=1000)
        tfidf_matrix = vectorizer.fit_transform([' '.join(words)])
        feature_array = np.array(vectorizer.get_feature_names_out())
        # Term indices sorted by TF-IDF weight, descending
        tfidf_sorting = np.argsort(tfidf_matrix.toarray()).flatten()[::-1]
        
        keywords = feature_array[tfidf_sorting][:num_keywords]
        return list(keywords)
    
    def summarize_with_graph(self, text, num_sentences=3):
        """Generate a summary with the TextRank algorithm."""
        sentences, _ = self.preprocess_text(text)
        if len(sentences) <= num_sentences:
            return text
            
        vectorizer = TfidfVectorizer(stop_words=list(self.stop_words))
        sentence_vectors = vectorizer.fit_transform(sentences)
        
        # Build the sentence similarity matrix; cosine_similarity supports
        # sparse input and returns 0 for all-zero rows, unlike the naive
        # np.linalg.norm approach, which fails on sparse vectors
        similarity_matrix = cosine_similarity(sentence_vectors)
        np.fill_diagonal(similarity_matrix, 0)
        
        # Run PageRank on the similarity graph to score sentences
        nx_graph = nx.from_numpy_array(similarity_matrix)
        scores = nx.pagerank(nx_graph)
        
        # Select the top-ranked sentences and restore document order
        ranked_indices = nlargest(num_sentences, scores, key=scores.get)
        summary = ' '.join(sentences[i] for i in sorted(ranked_indices))
        return summary

if __name__ == '__main__':
    import argparse
    
    parser = argparse.ArgumentParser(description='Intelligent text summarization and keyword extraction')
    parser.add_argument('text_file', help='path to the input text file')
    parser.add_argument('--sentences', type=int, default=3, help='number of sentences in the summary')
    parser.add_argument('--keywords', type=int, default=5, help='number of keywords to extract')
    parser.add_argument('--algorithm', choices=['tfidf', 'textrank'], default='textrank', 
                       help='summarization algorithm')
    
    args = parser.parse_args()
    
    with open(args.text_file, 'r', encoding='utf-8') as f:
        text = f.read()
    
    summarizer = TextSummarizer()
    
    if args.algorithm == 'tfidf':
        summary = summarizer.summarize_text(text, args.sentences)
    else:
        summary = summarizer.summarize_with_graph(text, args.sentences)
    
    keywords = summarizer.extract_keywords(text, args.keywords)
    
    print("\n=== 文本摘要 ===")
    print(summary)
    print("\n=== 关键词 ===")
    print(', '.join(keywords))
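
The script depends on four third-party libraries (nltk, scikit-learn, networkx, numpy), visible in the imports above; assuming a standard pip environment, one way to install them:

    pip install nltk scikit-learn networkx numpy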

Usage Notes

  1. Features:

    • Supports two summarization algorithms (TF-IDF and TextRank)
    • Automatically extracts core keywords
    • Preserves the key information of the original text
    • Handles English text
    • Adjustable summary length
  2. Usage (a library-usage sketch follows this list):

    python text_summarizer.py input.txt --sentences 5 --keywords 10 --algorithm textrank
  3. Arguments:

    • text_file: path to the input text file
    • --sentences: number of sentences in the summary
    • --keywords: number of keywords to extract
    • --algorithm: summarization algorithm (tfidf or textrank)
  4. Use cases:

    • Summarizing news articles
    • Extracting the core of academic papers
    • Quick document previews
    • Content analysis
    • Improving information retrieval
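
Beyond the command line, the TextSummarizer class can also be imported and used directly. A minimal sketch, assuming the script is saved as text_summarizer.py as in the command above (the sample text is illustrative):

    from text_summarizer import TextSummarizer

    summarizer = TextSummarizer()
    text = ("Natural language processing lets computers work with text. "
            "Extractive summarization selects the most informative sentences. "
            "Keyword extraction surfaces the most important terms. "
            "Both techniques support search, preview, and analysis workflows.")

    # TextRank summary with two sentences, plus the top five keywords
    print(summarizer.summarize_with_graph(text, num_sentences=2))
    print(', '.join(summarizer.extract_keywords(text, num_keywords=5)))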

Technical Highlights

  1. Combines TF-IDF and TextRank algorithms
  2. Graph-based sentence ranking
  3. Automatic keyword extraction
  4. Streamlined text preprocessing
  5. Extensible algorithm framework
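
For reference, the sentence scores computed by nx.pagerank in summarize_with_graph follow the weighted PageRank recurrence, where d is the damping factor (0.85 by default in networkx), N the number of sentences, and w_ji the cosine similarity between sentences j and i:

    S(V_i) = \frac{1-d}{N} + d \sum_{j \in \mathrm{In}(i)} \frac{w_{ji}}{\sum_{k \in \mathrm{Out}(j)} w_{jk}} \, S(V_j)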