import requests
import json
from lxml import etree
import time
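'''
Note on the Hebei Museum site: from the second page onward, replies are matched with
replys = '///*[@class="liuyantiaotaio t110"]/tr/td[2]/p/span/text()'
On the first page the reply expression is replys = '///*[@class="liuyantiaotaio t110"]/tr/td[2]/p/text()' -- i.e. without the span step.
'''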
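'''
Lessons learned:
For a site whose pages are not uniform, always check the scraped output. Crawl by bisection: with 100 pages
in total, for example, crawl the first 50; if they are fine, crawl the last 50. If the first 50 have a problem,
bisect within them and narrow it down step by step. There are bound to be a few pages whose data structure
differs from the others, which throws the scraped order off; find those pages and fill them in by hand.
'''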
# Opaque template token sent as "templateContent" in the POST payload.
# NOTE(review): in the source this literal was split across three physical
# lines (not valid Python); the fragments are joined here without any
# separator on the assumption that the breaks were line-wrap artifacts --
# confirm against a request captured from the live site.
_TEMPLATE_CONTENT = (
    "DeZxK0slash0baVKROiBLGeDyY0add0tpHWRP5U7Snmn1fk4CxMptxMyokRljqbhMdYw5hHv2kkU0vcLVHiO7LP0slash04mOguMi2N6JwJjQrfLzRRuGZZRxi0add0jDtNWNG2PbQKfotOQjtwLrxDadDlrG0slash0TGi5PRGy0V93AVA6rNRyk0U4Nr8n0add0j9foj0slash01zQ0add0sa4VXe00LT2oKeQ2y5ncLsy3hfkmfikOBX0slash0pXSG8sLohY06yUKdTRuD9Tnywz59vTYefqU1HDRdO9ly5TjyYqJ8gfPnzr1tSWAH3YNnS0slash0ZzbGFRbbDHdosEOek62MuT2YF5a2XmoHQYRjz9ZI4l6HrSd0yHATgTPn0add01wZ7A0add0ycQWveMZ9BXqIW1Avp0add0xbvbt4T8m02RIS6dqev483x7bZbeY2mfOiU0slash0QG2pTEk1p7fcF6zgjeAgQqcqyVt8N0Jxk0add0lczv5Kx1af0add0Ok0add0gc0rZEIy0add0mdcsFGI0add0Dy4TriGZXJX0slash0q05Akq6YypBuYymnVC8ikevd0OkU45rJF8PVYWTzbiUAeE8s6840add0PORifSZcSjWFqW5hV0momnJMv6YRooaZHiiGYAIIpZ0o0add0MpfmtJtFzF66zuh7hIa91oHN29XZfY47rOAgkgFOZwgrxatl4qU4BUm83MtE0UvycxD50add0zLwsnbcEKGo0Cucw3cwNPbl9z7rul15Q8FFarOR7mmpQ9NbcKZ3gSsrueoFNDTTYyB2kVawjIKCXK5q4QCs9E1TAN7wc0hlqmDzqu0slash0fo4a8p1gcKIByUNlIDRI8D1JOPxO6cIkCFNxg62UCrOexaZIRMJxpRXf4BroBMRcTZI31rruLcRkifaiyPt1yzxBZkw3PG0slash0htzChEynApJyODoYMzos5NSVpqiBG7EtN8Y9QUQPCvv1evrI1B8nMIQ0slash0LQCQD4rOBbA0add00slash0RrhpMnIcgng3x0slash02exwiX0NHS0slash0lnGoGMCVsKcRp2d10OxxLlXjEBPi3XhVV1k8R9kMV8L0add0B2mPFFT2ZwTj81M50slash0zWKg3nF6FagHAPFDflvBNJKOi3PugDIvdooyep1FXSv5RIUpg46MZKr0tOtIf2pJfcIe466tQiE4T0slash0TKikzkdPWIOCrmSk5PHV1UDB0Di6TISJ7bE0XacbkDYnYtQYiEpid4s23sDPtu9XWvPBM0add0klfHrz3PGVeI0ciUlhrGG5YaPECL7an8FUOPHLRoRlu1aq0add0WHsCrwvwn8QYGo5HtGoWFsvay9PjnAfIOLY2rTcj4mMcowo0slash0jARrUs71LZ8NZglOqSaJgKNOFQibltejsSHOnKEICLqFImVBwQ4aEhfxr15Nldsd3j1uMexU7veCfCFsZqWIm2zRdsHAMBcZCf0add0XnGnP8ApksEqzMEB0add0Z5uFKNH79fBZxfOnYgEP0hmC3mdEvP0slash0udiOmLXh2a28pcX82emoDmCqncyWnla0wGToj09ORl0Ftydmsljetez33r8HyGRN2ZYbKOWCH2n0add0xgmnloL653bXtSoPy8X5NjC0slash0cCS4zr6nJO0add0WMCrsZDzdKuZI0add0ay2BHUfITgPmrWz0nXjBiJZBcuAFZ0lTsXd5BaP4lNcUWHHMdKpakapteKiKPDnAUmpZwy6uBZA6T7eXNYdCFWC91PjkxkS3wsgk5gDu7v8QFY65uypedXTj4wqqvboCqPcGy0slash0vhBBwzIoE090vRARy9Jbqf7SV0X0add0tp2rwBHUSHTd0b7490faiv7PsGXNjWbLlcIy4X0add0ImG0add05tZrldezLFk0slash0OoJi1y36jZHdVBPAyZx"
    "0Om3GPYYOxU0ur0QxMKOR8gbl54nIOQFJ9huryOYfzr0slash0ilFN2VSWYNIXCDS0slash0IUhj4gP9pEizVNc8Ae4IDwod0VCESghaIsD4ZVVZVprjOR0z9WkeC1YMjFGVoPolvSRAx86B1e70slash0YAp0add0eqKPsva77YX0NW2YzhmWDMTydVrjUWskWM54e0slash00add0gFwmlCxHPo0tEI7OryGPptJZ90IzS0Oe0wD20dXDrLSz0add02FQn9mdRCm4ZDczpg4CK5q0add0Ma7cjAVvl1Ews3rzkCe0s33opfqhcxagAGHUZisB0slash0XjOhwpbXavoTtN0add0rSp4ym7yPPxFhXOJJoqoQK4LWWkcNVZpRsv0add0VqC6iuGK1rjYLoJRsgMsKa85k7fgTr0O3j9SHY65ReRFMlZV3ojQ5acE97rNP9fHOGH8l49WFl0add0rKSHT8iAzY7u7knxBhd2WB0slash0hdI9WPdMa27MOYaLoQvgr0add0SZIjDty30add0eyJHyhj8I3Gzg632AdmyVQ8JFoPmnlz5KE5nrZE5R0add0ZzbcTcS2dM91Dt34yVuVSJrY5Eers2MXPim615Y0add05cwMqHICB2Ba0slash0qMo9wd0slash0CMwMOTy76VvyUm2og0slash0RDQlrC0slash0iKvDE0slash02zbf0slash0ZcHzx6ovAFJUEbmMXrIHEkHO7haD4otCeADOc7oZJQyLIASu4B4nmB6cmtPVbpi50add0RFd2DWHF0add0f6C0add00add0wnTlcLf0add0E9aYmK5HpEVm3kBQjhSXETgsXUchQ2ui4Xi2rhNqd0add01uruaVxkWSf0add0423P0slash0Hs2aay435Yg3vTv5dcqdmsWWefpwnvy53o70N0add0ODi0vcfy8b0add0MJF0slash0igaa6nHphUwEgm1nN51AvjGGLmgHxD7gflYR2Nfs2QiBRJDDQRyNrlGH9g30slash0EqChsN6ZVbLi0l0TVOUpGl0slash0xMtgD0slash0ndhOA8dhoZF0add0zie3kO28JBbTtp8QXJkKMqdZ2WYsJIKw9Fbjv1QCs1S90add06D0OpPpb8peMNrIhSUA9QqtI3ddBJJxDrB0slash0vuYDyVpH71GdUrLjuiijfyLduWasEE7qAJT9n0rXOj6bKMwt8sp0slash0ETW0slash0J0add0yddSpoWTzE07r3lKEtdk0add0MDavZN52gSQRkksHV9oLoVE0slash00slash0pYdB21G4HVVU14Luer0add0Mcrde1Vh7Ldj5nbODKgatfJT4r2afOe1zqnSQTI7Tb8QudUSsYGDOIxXYa9ZRKKc90add0x1EaUqOLo4DGzCagAT6RECIU2cRS8u0slash0XIF9sYTtxDyONUeDpoIo6vgauwaacmk0add0cHhrbkzfF0add0o0qeYurIXVzPM4mc5eEE7DwqJuAnmQHRPHuI0slash03OwF31V045xdcOhpkqtDuDxrvJ8T0slash0rC10add04Xn9fxDwVFe0laUdyoasCBjufJug5LkCsGT3LyXMuFADNKNBGewFwq4eq0add0ThK2O0slash0865pdD0gZrCPc8EWEEcqN3BWyPCCm1acbBpxsGphaCTDqMQoOV3dx9byzJvf9k2S9LxIqbOu7FH4qlY0add0oRhj9B5UWHO0slash0DhYJ3IYhFWKiJ0add0vX3jyHNDF5QOotN32ipDlV67q0hFvWYXssesP3SkRQeEz0add0RNZpwiEnuQli0K8JB0add0S0slash0yyJi14G6IB5A1N0add0jxTUH86EhdfddoPP3CdNx2hwoPc03OvqubBJsJviqk0slash0js74xPN2ZmzKvsWSDKFa9p8CuARfMH3BlqE2cjtDZDWyaP4QL3m8RIt5aH3eMB4MW7eecG8aaJ"
    "4KXX8k9OjdTFcBcJndTKKBW9GnA3Rv4bsNiC8RSaQb15fFcvxstW1ObjBd0add0H6pj0oB1yscEcIO1qOoyPcy02zaMeiCfdsDWSGzwxfkuIzQX0gWcbX4hw0add0HHbAdLdsMHeOxGj0add00slash0BPuW8RmXJ32Va0add0jVSKLjnsh710slash0iHmz44n4cYjtGCCYA1np8nLJg60BredlPqtscaiBxfPLGmXp7caMbi3nxu6FLGLHTled5NukJOj0Lve8ZwoDeGL4arOyNB8xgsvDg0add096jQVMyF9Zk0add0AykUZnY0slash0i0slash0J7wHvYlq0DP7BCyrThyFtjzv8I7lrrSJ7TGP3lJIFKZBt6PXlSBQ0C9BnEgYmymmYTU4YVL0slash0anXvdLhjW3ouoK9XPvlE7W3IV0NLNX9dBX0slash07WHQrttxmzzyo1VfvQKicE9D8CXbRlppBLqHrqp0slash0QnGtKaPnEZ3PlDVigL5rJSd825PYBA0WOsFwDdCbezCEUBKf3SyHZwj69Tq0add0iRz69f4CET0slash0k70add0RXkvQEHBpgCaNSNzm0add0XLRA1NGZ8aUBvSb1WGO33Uztf0add0eal0add0pzlOGwOeBZ97JR0slash00mBzbQJC9mDEeB5h3T91oTWHE0B0I7yyFJZvd5xy188HEoDeGQxdZDZ1E0add00slash02SxWsGpIpYefarYXxhx0qCgHJz0O7Z2x2aq1gnCULPWSm0add0DxeA0slash0gRV6tvVXqBU7k7k5T22kLJRoq3OW9IS7GM55pKB0FLVNCrurrdUGIaccl4QjCpIFHyqJkw7DuOSvLDs52vsK6gNGyAtjkll2H3hzLJA5cRXl0TVqy1Flg0GrcJzQBTZ6l2sZ0add00add09axBOBcSRBkVVkuzgeJZgbvSOE6JAe9Qk49PzImACmgpfmr0dorLYkPvdOPtAjejcHHUdknpQHst3DZk8MEINluyM2QfgKd1jTSl6Yh8QcxsFYBjePz0add0jeZHLUZ6iEyUNd4q0add0rmxy1BpI2qL2wUl6x6PtE1Aca8T1IAW2sd92S2PJBugG5acXUGcXBy0slash0oMmd9IB89HR0add0TT2mstMWcIw0slash0bp10slash0qEkafFarpkC0add05JuKPLkoFnax2RCoG9BInnptesz34CLY0ZTGFTGXl54JKgvATPH0add058Wo6LveRz0uWiUvKhpRCEg0Sg52pZEJeAGZODh75syLuwDgi80slash0kGnXiVuOr60Gkm9JvLhfLVAjOwBfs35C6FHkW08To72VLtUnC73dnzYtNImN8rhgx"
)


def get_info(i, data=None):
    """Fetch page ``i`` of the message board and append its Q&A records to a file.

    The page is requested by POSTing a JSON payload (with ``i`` as
    ``"pageNum"``) to the site's dynamic-content endpoint. The ``"html"``
    field of the JSON reply is parsed with lxml, questions and replies are
    extracted with XPath, both lists are printed for manual inspection, and
    one record per question is appended to ``河北省博物馆21-22.txt``.

    Args:
        i: Page number sent as ``"pageNum"`` in the request payload.
        data: Unused; kept so existing callers that pass it keep working.

    Returns:
        None. Output goes to stdout and to the text file.

    Raises:
        IndexError: If fewer replies than questions were extracted. The
            check happens before anything is written, so a structurally
            different page no longer leaves a half-written batch behind.
        KeyError: If the JSON reply has no ``"html"`` field.
    """
    headers = {
        "Content-Type": "application/json; charset=UTF-8",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
    }
    url = "http://www.hebeimuseum.org.cn/api/stl/actions/dynamic"
    # The "next page" link's href is JavaScript, so there are two ways to
    # crawl it: drive a browser with Selenium, or build the POST request
    # ourselves (the page number travels in the POST body). This uses the
    # latter: the page is refreshed asynchronously via ajax, so calling the
    # ajax endpoint directly is enough.
    # The reply is JSON; its "html" value is re-parsed as HTML, so the tree
    # differs somewhat from the original page and XPath cannot be matched
    # from the document root -- print the content and write the expressions
    # by hand.
    payload = {
        "publishmentSystemId": 1,
        "pageNodeId": 60,
        "pageContentId": 0,
        "pageTemplateId": 4,
        "isPageRefresh": "false",
        "pageUrl": "qWZfgSqqIHil4y1y2m0add0HCx36j7GwwMx7",
        "ajaxDivId": "ajaxElement_1_915",
        "templateContent": _TEMPLATE_CONTENT,
        "pageNum": i,
    }
    # A timeout keeps an unresponsive server from hanging the crawl forever.
    reply = requests.post(
        url, data=json.dumps(payload), headers=headers, timeout=30
    ).json()
    content = etree.HTML(reply['html'])

    # Questions
    questions = content.xpath('///*[@class="t110"]/tr/td/text()')
    # Replies. This expression matches the first page's markup; the author
    # observed that from the second page on the reply text sits inside a
    # <span>, i.e. .../td[2]/p/span/text().
    replys = content.xpath(
        '///*[@class="liuyantiaotaio t110"]/tr/td[2]/p/text()')
    # Publication times. Extracted but not written: the records below carry
    # a fixed date instead.
    # NOTE(review): confirm whether ``times`` should replace the fixed date.
    times = content.xpath(
        '///*[@class="liuyantiaotaio"]/tr/td/font[2]/text()')
    # Top-level category
    primary_class = '河北省博物馆'

    # Printed so the two lists can be compared by eye for misalignment.
    print(questions, len(questions))
    print(replys, len(replys))

    # Fail before touching the file instead of crashing halfway through it.
    if len(replys) < len(questions):
        raise IndexError(
            f'page {i}: {len(questions)} questions but only '
            f'{len(replys)} replies were extracted'
        )
    if not questions:
        # Nothing to write; do not create or touch the output file.
        return

    # Open the output once per page rather than once per record.
    with open('河北省博物馆21-22.txt', 'a', encoding='utf-8') as file:
        for question, reply_text in zip(questions, replys):
            file.write('<REC>' + '\n')
            file.write('<发布时间>=' + '2018-06-23' + '\n')
            file.write('<问答>=' + str(question) + '\n')
            file.write('<回答>=' + str(reply_text) + '\n')
            file.write('<一级分类>=' + primary_class + '\n')
            file.write('<REC>' + '\n')
def get_more(start, end, delay=8):
    """Scrape pages ``start`` up to, but not including, ``end``.

    Calls ``get_info`` for each page number in ``range(start, end)`` and
    pauses after every page.

    Args:
        start: First page number to fetch.
        end: Page number to stop before (exclusive bound).
        delay: Seconds to sleep after each page. Defaults to 8, the value
            that was previously hard-coded.
    """
    for page in range(start, end):
        get_info(page)
        # Pause between requests so pages are not fetched back to back.
        time.sleep(delay)
# Script entry point: the range is half-open, so this scrapes only page 21.
# NOTE: this runs whenever the module is executed or imported.
get_more(start=21, end=22)
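# The scraped site paginates via JavaScript (build the POST request yourself: the page is refreshed
# asynchronously via ajax, so calling the ajax endpoint is enough); results are then saved in a fixed format.
# (Blog page footer) Latest recommended article published on 2021-08-06 20:16:19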