#coding=UTF-8
from urllib.request import Request, urlopen,quote
from urllib.error import URLError
import chardet
from bs4 import BeautifulSoup as BS
import sys
import re
# from readability.readability import Document
# from html2text import html2text
def __searchUrls(pageCur, pageTotal):
    """Print the articles linked from listing pages pageCur..pageTotal.

    For every listing page ``http://www.xxhh.com/duanzi/page/<n>`` the
    function looks at each ``div.section`` entry, follows its "more" link
    through ``getContextByurl`` and prints the returned text followed by a
    separator line.

    Args:
        pageCur: Number of the first listing page to fetch (inclusive).
        pageTotal: Number of the last listing page to fetch (inclusive).
            Nothing is fetched when ``pageCur > pageTotal``.

    Returns:
        None. All output goes to stdout.

    A listing page that fails with ``URLError`` is reported on stdout and
    skipped; the remaining pages are still processed.
    """
    # The header value must not repeat the "User-Agent:" field name; urllib
    # adds the field name itself.
    headers = {
        'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
    }
    separator = '---------------------------------------------'

    # Iterate rather than recurse once per page, so a large pageTotal cannot
    # exhaust the interpreter's recursion limit.
    page = pageCur
    while page <= pageTotal:
        url = 'http://www.xxhh.com/duanzi/page/' + str(page)
        page = page + 1
        try:
            # The context manager closes the connection even if read() fails.
            with urlopen(Request(url, headers=headers)) as response:
                content = response.read().decode('utf-8', 'ignore')
        except URLError as e:
            # HTTPError is a URLError subclass that has both `code` and
            # `reason`, so `code` has to be tested first to tell them apart.
            if hasattr(e, 'code'):
                print('The server couldn\'t fulfill the request.')
                print('Error code: ', e.code)
            else:
                print('We failed to reach a server.')
                print('Reason: ', e.reason)
            continue

        # Name the parser explicitly so results do not depend on which
        # optional parser libraries happen to be installed.
        soup = BS(content, 'html.parser')
        print("★ 嘻嘻哈哈第【" + str(page - 1) + "】页")
        for result_table in soup.findAll("div", {"class": "section"}):
            a_content = result_table.find("div", {"class": "user-section"})
            if a_content is None:
                continue
            a_href = a_content.find("a", {"class": "more flc80"})
            if a_href is None:
                continue
            href = a_href.get("href")
            if not href:
                continue
            # Fall back to '' so a None result cannot break the concatenation.
            text = getContextByurl('http://www.xxhh.com' + href) or ''
            print(text + '\n' + separator + '\n')
def getContextByurl(url):
    """Fetch ``url`` and return the text of its ``div.article`` element.

    Args:
        url: Address of the page to download.

    Returns:
        The text content of the first ``<div class="article">`` on the
        page, or ``''`` when the request fails with ``URLError`` or the
        page contains no such element.
    """
    # The header value must not repeat the "User-Agent:" field name; urllib
    # adds the field name itself.
    headers = {
        'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
    }
    try:
        # The context manager closes the connection even if read() fails.
        with urlopen(Request(url, headers=headers)) as response:
            html = response.read().decode('utf-8', 'ignore')
    except URLError as e:
        # HTTPError is a URLError subclass that has both `code` and
        # `reason`, so `code` has to be tested first to tell them apart.
        if hasattr(e, 'code'):
            print('The server couldn\'t fulfill the request.')
            print('Error code: ', e.code)
        else:
            print('We failed to reach a server.')
            print('Reason: ', e.reason)
        return ''

    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    soup = BS(html, 'html.parser')
    div_text = soup.find("div", {"class": "article"})
    if div_text is None:
        # Keep the "always returns a string" contract for pages that lack
        # the expected container.
        return ''
    return div_text.text
if __name__ == "__main__":
    # Script entry point: scrape listing pages 1 through 10.
    __searchUrls(pageCur=1, pageTotal=10)