Here's the code, take a look and then decide. utils.py:

import time
import requests
import json
import math
import random
import typing
class XHS_Splier_utils():
    def __init__(self):
        self.URL = "https://edith.xiaohongshu.com"
        self.JS_DOM_PATH = r"D:\lws_tanchen\project\tanchen_py_dy\tanchen_data_analysis\core\Crawler\xhs_crawler\node_modules\jsdom\lib"
    def search_generate_x_b3_traceid(self, length=16):
        # Random hex string used as the x-b3-traceid header value.
        x_b3_traceid = ""
        for _ in range(length):
            x_b3_traceid += "abcdef0123456789"[math.floor(16 * random.random())]
        return x_b3_traceid
def search_trans_cookies(self,cookies_str:str):
if '; ' in cookies_str:
ck = {i.split('=')[0]: '='.join(i.split('=')[1:]) for i in cookies_str.split('; ')}
else:
ck = {i.split('=')[0]: '='.join(i.split('=')[1:]) for i in cookies_str.split(';')}
return ck
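    # Example: "a1=xxx; webId=yyy" -> {"a1": "xxx", "webId": "yyy"}.
    # '='.join on the tail keeps cookie values that themselves contain '='.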
    def search_generate_xs_xs_common(self, a1, api, data=''):
        """
        execjs lets Python run JavaScript code.
        pip install PyExecJS==1.5.1
        """
        import execjs
        try:
            with open('./static/xhs_xs_xsc_56.js', 'r', encoding='utf-8') as f:
                js_code = f.read()
        except Exception:
            # Fall back to the parent directory (same pattern as search_generate_xray_traceid below).
            with open('../static/xhs_xs_xsc_56.js', 'r', encoding='utf-8') as f:
                js_code = f.read()
        js = execjs.compile(js_code, cwd=self.JS_DOM_PATH)
        ret = js.call('get_request_headers_params', api, data, a1)
        xs, xt, xs_common = ret['xs'], ret['xt'], ret['xs_common']
        return xs, xt, xs_common
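    # A minimal PyExecJS sketch, assuming a working Node.js runtime is available to execjs:
    #   ctx = execjs.compile("function add(a, b) { return a + b; }")
    #   ctx.call("add", 1, 2)  # -> 3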
    def search_generate_xray_traceid(self):
        import execjs
        try:
            # Read the JS file
            with open('./static/xhs_xray.js', 'r', encoding='utf-8') as f:
                js_code = f.read()
        except Exception:
            # Fall back to the parent directory when run from a different working directory.
            with open('../static/xhs_xray.js', 'r', encoding='utf-8') as f:
                js_code = f.read()
        xray_js = execjs.compile(js_code, cwd=self.JS_DOM_PATH)
        return xray_js.call('traceId')
def search_get_request_headers_template(self):
return {
"authority": "edith.xiaohongshu.com",
"accept": "application/json, text/plain, */*",
"accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
"cache-control": "no-cache",
"content-type": "application/json;charset=UTF-8",
"origin": "https://www.xiaohongshu.com",
"pragma": "no-cache",
"referer": "https://www.xiaohongshu.com/",
"sec-ch-ua": "\"Not A(Brand\";v=\"99\", \"Microsoft Edge\";v=\"121\", \"Chromium\";v=\"121\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"Windows\"",
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-site",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0",
"x-b3-traceid": "",
"x-mns": "unload",
"x-s": "",
"x-s-common": "",
"x-t": "",
"x-xray-traceid": self.search_generate_xray_traceid()
}
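    # x-s, x-t, x-s-common and x-b3-traceid above are placeholders;
    # search_generate_headers() below fills them in per request.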
def search_generate_headers(self,a1, api, data):
xs, xt, xs_common = self.search_generate_xs_xs_common(a1, api, data)
x_b3_traceid = self.search_generate_x_b3_traceid()
headers = self.search_get_request_headers_template()
headers['x-s'] = xs
headers['x-t'] = str(xt)
headers['x-s-common'] = xs_common
headers['x-b3-traceid'] = x_b3_traceid
if data:
data = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
return headers, data
def search_generate_request_params(self,cookies_str:str,api:str,data):
cookies = self.search_trans_cookies(cookies_str)
a1 = cookies['a1']
headers, data = self.search_generate_headers(a1, api, data)
return headers, cookies, data
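    # The returned tuple plugs straight into requests, as single_search_api does below:
    #   headers, cookies, data = self.search_generate_request_params(cookies_str, api, data)
    #   requests.post(url=self.URL + api, headers=headers, cookies=cookies, data=data.encode("utf-8"))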
    def single_search_api(self, query: str, cookies_str: str, page: int = 1, sort_type_choice: int = 0, note_type: int = 0, note_time: int = 0, note_range: int = 0, pos_distance: int = 0, geo: str = None, proxies: dict = None):
        """
        Fetch one page of note search results.
        :param query: keyword to search for
        :param cookies_str: your cookies string
        :param page: page number to fetch
        :param sort_type_choice: sort order: 0 comprehensive, 1 newest, 2 most likes, 3 most comments, 4 most collects
        :param note_type: note type: 0 all, 1 video notes, 2 normal notes
        :param note_time: note age: 0 all, 1 within a day, 2 within a week, 3 within half a year
        :param note_range: note scope: 0 all, 1 viewed, 2 not viewed, 3 followed
        :param pos_distance: distance: 0 all, 1 same city, 2 nearby; 1 or 2 require geo
        :param geo: latitude/longitude, e.g. {"latitude": 39.9725, "longitude": 116.4207}
        :param proxies: optional, e.g. {"http": "", "https": ""} (dict)
        :return: (success, msg, res_json) for the search
        """
        res_json = None
        # Map the numeric choices onto the literal filter-tag strings the endpoint expects
        # (the Chinese literals below are the exact values the API accepts).
        sort_type = {
            1: "time_descending",
            2: "popularity_descending",
            3: "comment_descending",
            4: "collect_descending",
        }.get(sort_type_choice, "general")
        filter_note_type = {1: "视频笔记", 2: "普通笔记"}.get(note_type, "不限")
        filter_note_time = {1: "一天内", 2: "一周内", 3: "半年内"}.get(note_time, "不限")
        filter_note_range = {1: "已看过", 2: "未看过", 3: "已关注"}.get(note_range, "不限")
        filter_pos_distance = {1: "同城", 2: "附近"}.get(pos_distance, "不限")
        if geo:
            geo = json.dumps(geo, separators=(',', ':'))
        try:
            # Example: query="榴莲", sort_type_choice=0, note_type=0, note_time=0,
            # note_range=0, pos_distance=0, geo={"latitude": 39.9725, "longitude": 116.4207}
            # (see the docstring above for what each value means).
api="/api/sns/web/v1/search/notes"
data = {
"keyword": query,
"page": page,
"page_size": 20,
"search_id": self.search_generate_x_b3_traceid(21),
"sort": "general",
"note_type": 0,
"ext_flags": [],
"filters": [
{
"tags": [
sort_type
],
"type": "sort_type"
},
{
"tags": [
filter_note_type
],
"type": "filter_note_type"
},
{
"tags": [
filter_note_time
],
"type": "filter_note_time"
},
{
"tags": [
filter_note_range
],
"type": "filter_note_range"
},
{
"tags": [
filter_pos_distance
],
"type": "filter_pos_distance"
}
],
"geo": geo,
"image_formats": [
"jpg",
"webp",
"avif"
]
}
            headers, cookies, data = self.search_generate_request_params(cookies_str, api, data)
            response = requests.post(url=self.URL + api, headers=headers, data=data.encode("utf-8"), cookies=cookies, proxies=proxies)
            res_json = response.json()
            success, msg = res_json["success"], res_json["msg"]
        except Exception as e:
            success = False
            msg = str(e)
        return success, msg, res_json
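    # A minimal call sketch (keyword is illustrative; COOKIES is defined at module level below):
    #   success, msg, res = XHS_Splier_utils().single_search_api(query="榴莲", cookies_str=COOKIES)
    #   if success:
    #       items = res["data"]["items"]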
    def all_search_api(self, query: str, cookies_str: str, require_num: int = 500, sort_type_choice=0, note_type=0, note_time=0, note_range=0, pos_distance=0, geo="", proxies: dict = None):
        """
        Search for a requested number of notes, with sort order, note type and count configurable.
        The PC web endpoint tops out at roughly 200-odd notes; the default asks for 500.
        A random per-page delay is available below (commented out); adjust the range or drop it.
        :param query: keyword to search for
        :param require_num: number of notes to fetch, default 500
        :param cookies_str: your cookies string
        :param sort_type_choice: sort order: 0 comprehensive, 1 newest, 2 most likes, 3 most comments, 4 most collects
        :param note_type: note type: 0 all, 1 video notes, 2 normal notes
        :param note_time: note age: 0 all, 1 within a day, 2 within a week, 3 within half a year
        :param note_range: note scope: 0 all, 1 viewed, 2 not viewed, 3 followed
        :param pos_distance: distance: 0 all, 1 same city, 2 nearby; 1 or 2 require geo
        :param geo: location info (latitude/longitude)
        :return: (success, msg, note_list)
        """
page = 1
note_list = []
try:
while True:
success, msg, res_json = self.single_search_api(query, cookies_str, page, sort_type_choice, note_type,
note_time, note_range, pos_distance, geo, proxies)
if not success:
raise Exception(msg)
if "items" not in res_json["data"]:
break
notes = res_json["data"]["items"]
note_list.extend(notes)
page += 1
                if len(note_list) >= require_num or not res_json["data"]["has_more"]:
                    break
                print("note_list", note_list)
                # time.sleep(random.randint(1, 5))  # random per-page delay; enable/adjust as needed
except Exception as e:
success = False
msg = str(e)
if len(note_list) > require_num:
note_list = note_list[:require_num]
return success, msg, note_list
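# Pagination note: all_search_api keeps requesting pages until data["has_more"] is false or
# require_num notes are collected, then trims note_list down to require_num.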
COOKIES='abRequestId=0ce8e19f-c357-5b21-a435-349316627ab2; a1=19777a5b680yiog7y6vwx1erczenf5aup16obr46u50000367007; webId=167052dbe181764057d527bed85d8ae8; gid=yjWWW02SYJU2yjWWW02DKyF4Y8v3lkWvKhECKK7ydFSudl28Ii207D888qKW88W80yDD88y2; xsecappid=xhs-pc-web; webBuild=4.72.0; web_session=040069b24a794e37836707ee413a4b6f92710c; unread={%22ub%22:%22686e4bfe000000000b01d57c%22%2C%22ue%22:%226862b458000000001c034d32%22%2C%22uc%22:27}; acw_tc=0ad583f417524771866795359e8c06b0dd25f3e6eb69b5f2e090ce575d9b27; websectiga=3633fe24d49c7dd0eb923edc8205740f10fdb18b25d424d2a2322c6196d2a4ad; sec_poison_id=18be10d8-a62f-4f6b-a4cf-3c734e2af7d0; loadts=1752477207049'
# success, msg, notes=XHS_Splier_utils().single_search_api(query="榴莲",cookies_str=COOKIES)
# print(notes)
success, msg, notes=XHS_Splier_utils().all_search_api(query="榴莲",cookies_str=COOKIES)
print("notes",notes)
print("长度",len(notes)),和xhs_creator_xs.js的代码片段const crypto = require('crypto');
let key = 'glt6h61ta7kisow7'
let iv = '4hrivgw5s342f9b2'
key = Buffer.from(key);
iv = Buffer.from(iv);

Everything below that is function calls. xhs_xray.js begins with:

self = global;
window = global;
var zc666;

Everything below that is also function calls. xhs_xray_pack1.js contains:

(self.webpackChunkxhs_pc_web = self.webpackChunkxhs_pc_web || []).push([[861], {}])

and xhs_xray_pack2.js contains:

(self.webpackChunkxhs_pc_web = self.webpackChunkxhs_pc_web || []).push([[121], {}])

And xhs_xs_xsc_56.js:

const jsdom = require("jsdom");
const { JSDOM } = jsdom;
const html = "<!DOCTYPE html><p></p>";
const resourceLoader = new jsdom.ResourceLoader({
userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
});
const dom = new JSDOM(html,{
url: "https://www.xiaohongshu.com",
referrer: "https://www.xiaohongshu.com",
contentType: "text/html",
resources: resourceLoader,
})
window = dom.window;
document = window.document;
// DOMParser = window.DOMParser;
// location = window.location;
// navigator = window.navigator;
// localStorage = window.localStorage;
// class AudioContextMock {
// constructor() {
// }
// }
// class webkitAudioContextMock {
// constructor() {
// }
// }
// var indexedDB = {}
// var canvas = {
// toDataURL: function toDataURL() {
// },
// getContext: function getContext(x) {
// }
// };
window.document.cookie = "a1=1927f6098768njq4co9jqukn0qtc8irx7u3ixrnxs50000565146;"
// history = {}
// Image = function () {
//
// }
// PluginArray = function () {
//
// }
// indexedDB = function () {
//
// }
// WebSocket = function () {
//
// }
var esm_typeof = {
Z: function (t) {
return typeof t;
},
};
var r = [],

There is also a jsdom folder under node_modules.