import requests
from lxml import etree
import time
import os
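# A small crawler for the wuchong.me blog: it collects article links from the
# homepage, downloads each post, and saves the extracted text under ./articles.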
class BlogCrawler:
    def __init__(self):
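        # Browser-like request headers so requests from this script look like an ordinary visit.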
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            "cache-control": "no-cache",
            "pragma": "no-cache",
            "referer": "https://wuchong.me/"
        }
        self.base_url = "https://wuchong.me"
    def get_article_links(self):
        """Fetch all article links from the blog homepage."""
        try:
            print("Fetching article list...")
            resp = requests.get(self.base_url, headers=self.headers, timeout=10)
            resp.encoding = "utf-8"
            if resp.status_code != 200:
                print(f"Failed to fetch article list, status code: {resp.status_code}")
                return []
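            # Each post on the homepage is expected to be an <article class="post ...">
            # containing an <a class="post-title-link"> with the title and a (possibly relative) href.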
            tree = etree.HTML(resp.text)
            articles = tree.xpath('//article[contains(@class, "post")]')
            article_links = []
            for article in articles:
                try:
                    title_element = article.xpath('.//a[@class="post-title-link"]/text()')
                    link_element = article.xpath('.//a[@class="post-title-link"]/@href')
                    if title_element and link_element:
                        title = title_element[0].strip()
                        link = link_element[0]
                        if not link.startswith('http'):
                            link = self.base_url + link
                        article_links.append((title, link))
                        print(f"Found article: {title}")
                except Exception as e:
                    print(f"Error while processing an article link: {str(e)}")
                    continue
            return article_links
        except Exception as e:
            print(f"Error while fetching the article list: {str(e)}")
            return []
    def fetch_article(self, url):
        """Fetch the content of a single article."""
        try:
            print(f"Fetching article content: {url}")
            resp = requests.get(url, headers=self.headers, timeout=10)
            resp.encoding = "utf-8"
            if resp.status_code != 200:
                print(f"Failed to fetch article content, status code: {resp.status_code}")
                return None
            tree = etree.HTML(resp.text)
            title = tree.xpath('//h1[@class="post-title"]/text()')
            title = ''.join(title).strip() if title else "Unknown title"
            publish_date = tree.xpath('//div[@class="post-meta"]//time/text()')
            publish_date = ''.join(publish_date).strip()
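            # Walk the post body and collect paragraphs, h2/h3 headings and lists in document order.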
            content_elements = tree.xpath(
                '//div[contains(@class, "post-body")]//p'
                ' | //div[contains(@class, "post-body")]//h2'
                ' | //div[contains(@class, "post-body")]//h3'
                ' | //div[contains(@class, "post-body")]//ul'
                ' | //div[contains(@class, "post-body")]//ol')
            article_content = []
            for element in content_elements:
                try:
                    if element.tag == 'p':
                        text = ''.join(element.xpath('.//text()')).strip()
                        if text:
                            article_content.append(text)
                    elif element.tag in ['h2', 'h3']:
                        text = ''.join(element.xpath('.//text()')).strip()
                        if text:
                            # Turn h2/h3 into Markdown-style "##"/"###" headings.
                            article_content.append(f"\n{'#' * int(element.tag[1])} {text}\n")
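                    # Both unordered and ordered lists are flattened into numbered lines.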
                    elif element.tag in ['ul', 'ol']:
                        list_items = element.xpath('.//li')
                        for i, item in enumerate(list_items, 1):
                            item_text = ''.join(item.xpath('.//text()')).strip()
                            if item_text:
                                lines = item_text.split('\n')
                                article_content.append(f"{i}. {lines[0].strip()}")
                                for line in lines[1:]:
                                    if line.strip():
                                        article_content.append(f"   {line.strip()}")
                        # Blank line after each list.
                        article_content.append("")
                except Exception as e:
                    print(f"Error while processing a content element: {str(e)}")
                    continue
            return {
                "title": title,
                "publish_date": publish_date,
                "content": article_content,
                "url": url
            }
        except Exception as e:
            print(f"Error while fetching the article: {str(e)}")
            return None
    def save_article(self, article_data, output_dir="articles"):
        """Save an article to a text file."""
        try:
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
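            # Keep only alphanumerics (str.isalnum() is True for CJK characters too),
            # spaces, dashes and underscores, so the title is safe to use as a filename.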
            safe_title = "".join(x for x in article_data["title"] if x.isalnum() or x in (' ', '-', '_')).strip()
            if not safe_title:
                safe_title = "untitled"
            filename = f"{output_dir}/{safe_title}.txt"
            print(f"Saving article: {filename}")
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(f"Title: {article_data['title']}\n")
                f.write(f"Published: {article_data['publish_date']}\n")
                f.write(f"Source URL: {article_data['url']}\n")
                f.write("\n" + "=" * 50 + "\n\n")
                f.write("\n".join(article_data['content']))
            print(f"Article saved: {filename}")
        except Exception as e:
            print(f"Error while saving the article: {str(e)}")
    def crawl_all_articles(self):
        """Crawl every article found on the homepage."""
        article_links = self.get_article_links()
        if not article_links:
            print("No article links found")
            return
        print(f"\nFound {len(article_links)} articles in total")
        for i, (title, link) in enumerate(article_links, 1):
            print(f"\n[{i}/{len(article_links)}] Fetching: {title}")
            print(f"Link: {link}")
            article_data = self.fetch_article(link)
            if article_data:
                self.save_article(article_data)
                print(f"Article '{title}' saved")
            else:
                print(f"Failed to fetch article '{title}'")
            time.sleep(2)
if __name__ == "__main__":
    try:
        crawler = BlogCrawler()
        crawler.crawl_all_articles()
    except KeyboardInterrupt:
        print("\nInterrupted by user")
    except Exception as e:
        print(f"Program error: {str(e)}")