# 17.Python实战：Python自动获取网页文章_python 开源获取网页文章-CSDN博客
#
# 本文链接：https://blog.csdn.net/qq_14815605/article/details/145592638
import os
import time
from urllib.parse import urljoin

import requests
from lxml import etree

class BlogCrawler:
    """Crawl a blog's index page and save every listed article as a text file.

    The crawler is best-effort: network, parsing and file errors are reported
    on stdout and turned into empty/``None`` results instead of propagating,
    so one broken article never aborts the whole run.
    """

    # Block-level elements that make up an article body, in document order.
    # Elements nested inside a list are excluded here because their text is
    # already emitted as part of the enclosing top-level list item; selecting
    # them again would duplicate that text in the output.
    _CONTENT_XPATH = (
        '//div[contains(@class, "post-body")]'
        '//*[self::p or self::h2 or self::h3 or self::ul or self::ol]'
        '[not(ancestor::ul or ancestor::ol)]'
    )

    def __init__(self, base_url="https://wuchong.me"):
        """Store the blog's base URL and build browser-like request headers.

        Args:
            base_url: Root URL of the blog to crawl. Trailing slashes are
                dropped so the value can be compared and joined consistently.
        """
        self.base_url = base_url.rstrip("/")
        # Browser-like headers so the request looks like an ordinary page view.
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            # Ask caches along the way not to serve a stored copy.
            "cache-control": "no-cache",
            "pragma": "no-cache",
            "referer": self.base_url + "/",
        }

    @staticmethod
    def _format_list(item_texts, ordered):
        """Render the raw text of list items as Markdown-style lines.

        Args:
            item_texts: Raw text of each list item, in order.
            ordered: ``True`` to number the items (``1.``), ``False`` to
                bullet them (``-``).

        Returns:
            A list of output lines. Blank items are skipped but still consume
            a number; extra lines of an item are indented under its marker.
        """
        lines = []
        for number, raw in enumerate(item_texts, 1):
            text = raw.strip()
            if not text:
                continue
            marker = f"{number}." if ordered else "-"
            first, *rest = text.split("\n")
            lines.append(f"{marker} {first.strip()}")
            # Align continuation lines with the text that follows the marker.
            indent = " " * (len(marker) + 1)
            lines.extend(f"{indent}{line.strip()}" for line in rest if line.strip())
        return lines

    @staticmethod
    def _safe_filename(title):
        """Reduce *title* to letters, digits, spaces, ``-`` and ``_``.

        Returns ``"untitled"`` when nothing usable is left.
        """
        cleaned = "".join(ch for ch in title if ch.isalnum() or ch in (' ', '-', '_')).strip()
        return cleaned or "untitled"

    def get_article_links(self):
        """Fetch the index page and collect the articles listed on it.

        Returns:
            A list of ``(title, absolute_url)`` tuples. The list is empty when
            the request fails, the status is not 200, or nothing matches.
        """
        try:
            print("正在获取文章列表...")
            resp = requests.get(self.base_url, headers=self.headers, timeout=10)
            resp.encoding = "utf-8"

            if resp.status_code != 200:
                print(f"获取文章列表失败，状态码：{resp.status_code}")
                return []

            page = etree.HTML(resp.text)
            article_links = []

            for article in page.xpath('//article[contains(@class, "post")]'):
                try:
                    titles = article.xpath('.//a[@class="post-title-link"]/text()')
                    hrefs = article.xpath('.//a[@class="post-title-link"]/@href')
                    if not (titles and hrefs):
                        continue

                    title = titles[0].strip()
                    # urljoin leaves absolute links untouched and resolves
                    # root-relative, page-relative and protocol-relative ones.
                    link = urljoin(self.base_url + "/", hrefs[0])

                    article_links.append((title, link))
                    print(f"找到文章：{title}")
                except Exception as exc:
                    # Skip only the malformed entry and keep the rest.
                    print(f"处理单个文章链接时出错：{exc}")

            return article_links

        except Exception as exc:
            print(f"获取文章列表时发生错误：{exc}")
            return []

    def fetch_article(self, url):
        """Download one article page and extract its text content.

        Args:
            url: Absolute URL of the article page.

        Returns:
            A dict with the keys ``"title"``, ``"publish_date"``,
            ``"content"`` (list of output lines) and ``"url"``, or ``None``
            when the request fails or the status is not 200.
        """
        try:
            print(f"正在获取文章内容：{url}")
            resp = requests.get(url, headers=self.headers, timeout=10)
            resp.encoding = "utf-8"

            if resp.status_code != 200:
                print(f"获取文章内容失败，状态码：{resp.status_code}")
                return None

            page = etree.HTML(resp.text)

            # Fall back to the placeholder when the heading is missing OR
            # contains only whitespace.
            title = ''.join(page.xpath('//h1[@class="post-title"]/text()')).strip() or "未知标题"
            publish_date = ''.join(page.xpath('//div[@class="post-meta"]//time/text()')).strip()

            article_content = []
            for element in page.xpath(self._CONTENT_XPATH):
                try:
                    if element.tag == 'p':
                        text = ''.join(element.xpath('.//text()')).strip()
                        if text:
                            article_content.append(text)

                    elif element.tag in ('h2', 'h3'):
                        text = ''.join(element.xpath('.//text()')).strip()
                        if text:
                            # h2 -> "##", h3 -> "###"
                            level = int(element.tag[1])
                            article_content.append(f"\n{'#' * level} {text}\n")

                    elif element.tag in ('ul', 'ol'):
                        # Only direct <li> children: the text of a nested list
                        # is already part of its parent item's text.
                        item_texts = [
                            ''.join(item.xpath('.//text()'))
                            for item in element.xpath('./li')
                        ]
                        article_content.extend(
                            self._format_list(item_texts, ordered=element.tag == 'ol')
                        )
                        # Blank line separates the list from what follows.
                        article_content.append("")

                except Exception as exc:
                    # Drop only the element that failed to render.
                    print(f"处理内容元素时出错：{exc}")

            return {
                "title": title,
                "publish_date": publish_date,
                "content": article_content,
                "url": url
            }

        except Exception as exc:
            print(f"抓取文章时发生错误：{exc}")
            return None

    def save_article(self, article_data, output_dir="articles"):
        """Write one article to ``<output_dir>/<sanitized title>.txt``.

        An existing file with the same name is overwritten.

        Args:
            article_data: Dict with the keys ``"title"``, ``"publish_date"``,
                ``"content"`` (list of lines) and ``"url"``.
            output_dir: Target directory; created if it does not exist.

        Returns:
            The path of the written file, or ``None`` if saving failed.
        """
        try:
            # exist_ok avoids the check-then-create race of a separate
            # os.path.exists() test.
            os.makedirs(output_dir, exist_ok=True)

            filename = os.path.join(
                output_dir, f"{self._safe_filename(article_data['title'])}.txt"
            )
            print(f"正在保存文章：{filename}")

            with open(filename, 'w', encoding='utf-8') as f:
                f.write(f"标题：{article_data['title']}\n")
                f.write(f"发布时间：{article_data['publish_date']}\n")
                f.write(f"原文链接：{article_data['url']}\n")
                f.write("\n" + "=" * 50 + "\n\n")
                f.write("\n".join(article_data['content']))

            print(f"文章已保存：{filename}")
            return filename

        except Exception as exc:
            print(f"保存文章时发生错误：{exc}")
            return None

    def crawl_all_articles(self, output_dir="articles", delay=2):
        """Fetch and save every article listed on the index page.

        Args:
            output_dir: Directory the article files are written to.
            delay: Seconds to wait between two article requests, to keep the
                request rate low. No wait happens after the last article.
        """
        article_links = self.get_article_links()
        if not article_links:
            print("未找到任何文章链接")
            return

        total = len(article_links)
        print(f"\n共发现 {total} 篇文章")

        for i, (title, link) in enumerate(article_links, 1):
            print(f"\n[{i}/{total}] 正在抓取: {title}")
            print(f"\n链接为：{link}")
            article_data = self.fetch_article(link)

            if not article_data:
                print(f"文章 '{title}' 抓取失败")
            elif self.save_article(article_data, output_dir):
                print(f"文章 '{title}' 已保存")
            else:
                # Fetching worked but writing the file did not.
                print(f"文章 '{title}' 保存失败")

            if delay and i < total:
                time.sleep(delay)


if __name__ == "__main__":
    # Script entry point: run one full crawl and report how it ended.
    try:
        BlogCrawler().crawl_all_articles()
    except KeyboardInterrupt:
        # Ctrl+C is a normal way to stop a long crawl; report it without a traceback.
        print("\n程序被用户中断")
    except Exception as err:
        # Last-resort handler so any other failure ends with a readable message.
        print(f"程序运行出错：{err}")