【Python in Practice】Scraping pages 1-10 of Xixihaha (xxhh.com) with Python

This post walks through a simple web crawler written in Python that scrapes joke posts from a target site, paging through the listing one page at a time. It shows how to set request headers, parse the page HTML, and extract the desired content.


# coding=utf-8
from urllib.request import Request, urlopen
from urllib.error import URLError
from bs4 import BeautifulSoup as BS



def __searchUrls(pageCur, pageTotal):
    url = 'http://www.xxhh.com/duanzi/page/' + str(pageCur)
    if pageCur > pageTotal:  # only fetch the first pageTotal pages
        return
    else:
        try:
            # Pretend to be a regular browser; the default urllib User-Agent is often rejected
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
            }
            req = Request(url, headers=headers)
            response = urlopen(req)
            content = response.read().decode('utf-8', 'ignore')
            soup = BS(content, 'html.parser')
            # f = open('xixihaha.txt', "a+", encoding='utf-8')  # write to file
            print("★ Xixihaha page [" + str(pageCur) + "]")
            for result_table in soup.findAll("div", {"class": "section"}):
                a_content = result_table.find("div", {"class": "user-section"})
                if a_content is None:
                    continue
                a_href = a_content.find("a", {"class": "more flc80"})
                if a_href is None:
                    continue
                text = getContextByurl('http://www.xxhh.com' + a_href.get("href"))
                print(text + '\n' + '---------------------------------------------' + '\n')
                # f.write(text + '\n' + '---------------------------------------------' + '\n')
        except URLError as e:
            if hasattr(e, 'reason'):
                print('We failed to reach a server.')
                print('Reason: ', e.reason)
            elif hasattr(e, 'code'):
                print('The server couldn\'t fulfill the request.')
                print('Error code: ', e.code)
        pageCur = pageCur + 1
        __searchUrls(pageCur, pageTotal)  # recurse to the next page

def getContextByurl(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
        }
        req = Request(url, headers=headers)
        response = urlopen(req)
        html = response.read().decode('utf-8', 'ignore')
        soup = BS(html, 'html.parser')
        # The joke body sits in a <div class="article"> on the detail page
        div_text = soup.find("div", {"class": "article"})
        if div_text is None:
            return ''
        return div_text.text
    except URLError as e:
        if hasattr(e, 'reason'):
            print('We failed to reach a server.')
            print('Reason: ', e.reason)
        elif hasattr(e, 'code'):
            print('The server couldn\'t fulfill the request.')
            print('Error code: ', e.code)
        return ''

if __name__ == '__main__':
    __searchUrls(1, 10)  # scrape pages 1 through 10 of Xixihaha
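
The commented-out f = open(...) lines above hint at saving the results instead of only printing them. Below is a minimal sketch of the same pagination written as a plain for loop that appends each joke to xixihaha.txt; it assumes the same xxhh.com page structure and CSS classes as the code above, reuses getContextByurl, and the helper name saveAllPages is just an illustrative choice.

# Minimal sketch: iterative pagination with file output (assumes the same
# xxhh.com markup and reuses getContextByurl defined above; saveAllPages is
# a hypothetical helper name).
def saveAllPages(pageTotal, filename='xixihaha.txt'):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
    }
    with open(filename, 'a+', encoding='utf-8') as f:
        for pageCur in range(1, pageTotal + 1):
            url = 'http://www.xxhh.com/duanzi/page/' + str(pageCur)
            try:
                req = Request(url, headers=headers)
                content = urlopen(req).read().decode('utf-8', 'ignore')
            except URLError as e:
                print('Failed to fetch page', pageCur, ':', e)
                continue
            soup = BS(content, 'html.parser')
            for section in soup.findAll("div", {"class": "section"}):
                user_section = section.find("div", {"class": "user-section"})
                if user_section is None:
                    continue
                link = user_section.find("a", {"class": "more flc80"})
                if link is None:
                    continue
                text = getContextByurl('http://www.xxhh.com' + link.get("href"))
                f.write(text + '\n' + '---------------------------------------------' + '\n')

# saveAllPages(10)  # would write pages 1-10 to xixihaha.txt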
