优化 oe抓取 tech where_used

本文介绍了一个采用多线程爬虫抓取汽车配件数据的案例,针对CSV存储时产生的读写冲突问题,提出将每条数据独立存储后再进行合并的方法。文章详细展示了爬虫实现的具体代码和技术细节。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

问题:最初采用向同一个 CSV 文件追加写入的方式,但多个线程同时写同一个文件,会造成读写冲突。

解决方法:将每个 OE 号的数据分别存储到独立的文件中,抓取完成后再统一合并。


import logging
import random
import threading
import urllib.parse
import urllib.parse
import requests
from queue import Queue
import pymysql
from bs4 import BeautifulSoup
import time
import  re
import csv
import json
import os
import pandas as pd
class Spider():
    """Scraper for part data from the Monroe catalog endpoint.

    For every part number, :meth:`getTECHandWhereUsed` writes two per-part
    CSV files (technical specs and vehicle fitment).  Each part gets its own
    pair of files so that worker threads never write to a shared file.
    """

    # NOTE(review): the host portion looks like a link-proxy wrapper around
    # http://catalog.monroe.com -- confirm the intended base URL before use.
    BASE_URL = 'http://catalog.monroe.com/catalogPart/partResults.do'

    # Seconds to wait for the server before giving up; without a timeout a
    # stalled connection would block the calling thread indefinitely.
    REQUEST_TIMEOUT = 30

    # Pool of User-Agent strings; one is picked at random for each request.
    USER_AGENTS = (
        'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
        'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
        'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
        'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11',
    )

    # Column layout of the per-part technical-spec CSV.
    TECH_SPEC_COLUMNS = (
        "no", "bType", "UpperMountingCode", "LowerMountingCode", "bLength",
        "cLength", "eLength", "tlength", "dShield", "ppack", "cBStop",
        "ebStop", "ehLockout", "DustShieldMaterial",
    )

    # Column layout of the per-part vehicle-fitment CSV.
    WHERE_USED_COLUMNS = ("no", "make", "model", "year")

    # Attribute name in the response payload -> CSV column it is stored in.
    _ATTRIBUTE_COLUMNS = {
        "Body Type": "bType",
        "Upper Mounting Code": "UpperMountingCode",
        "Lower Mounting Code": "LowerMountingCode",
        "Body Length": "bLength",
        "Compressed Length": "cLength",
        "Extended Length": "eLength",
        "Travel Length": "tlength",
        "Dust Shield": "dShield",
        "Parts Pack(s)": "ppack",
        "Compressed Bumper Stop": "cBStop",
        "Extended Bumper Stop": "ebStop",
        "Extended Hydraulic Lockout": "ehLockout",
        "Dust Shield Material": "DustShieldMaterial",
    }

    def randHeader(self):
        """Return request headers with a randomly chosen User-Agent.

        Only the ``User-Agent`` value varies between calls; the other three
        header values are fixed.
        """
        return {
            'Connection': 'Keep-Alive',
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent': random.choice(self.USER_AGENTS),
        }

    def getPartNumber(self, no):
        """Look up the catalog's internal part id for part number *no*.

        Returns:
            The string ``"0"`` when the response body is ``null``; otherwise
            the ``"value"`` field of the first bracketed JSON object in the
            response.

        Raises:
            ValueError: if the response contains no ``[...]`` payload.
        """
        url = self.BASE_URL + '?&domain=monroe&locale=en&partNumber=' + no + '&selection=findPart'
        res = requests.get(url=url, headers=self.randHeader(), timeout=self.REQUEST_TIMEOUT)
        html = res.text
        if html.strip() == "null":
            print("kong")
            return "0"
        # Drop the literal text "\xae" (an escaped registered-trademark sign).
        s = str(html).replace("\\xae", "")
        payloads = re.findall(r"\[(.*?)\]", s)
        if not payloads:
            raise ValueError("unexpected part lookup response for " + repr(no))
        return json.loads(payloads[0])["value"]

    def getTECHandWhereUsed(self, no):
        """Scrape part *no* and write its tech-spec and where-used CSV files.

        When the part is not found, both files are still written, with ``#``
        in every column except ``no``, so that every requested part has a
        pair of files.
        """
        print(no)
        selectedPartId = self.getPartNumber(no)
        if selectedPartId == "0":
            placeholder_specs = [no] + ["#"] * (len(self.TECH_SPEC_COLUMNS) - 1)
            self._write_csv("tech_specs", no, [placeholder_specs], self.TECH_SPEC_COLUMNS)
            placeholder_usage = [no] + ["#"] * (len(self.WHERE_USED_COLUMNS) - 1)
            self._write_csv("where_used", no, [placeholder_usage], self.WHERE_USED_COLUMNS)
            return

        url = self.BASE_URL + "?&selectedCatalogId=974&selectedPartId=" + str(
            selectedPartId) + "&selection=partDetails"
        req = requests.get(url=url, headers=self.randHeader(), timeout=self.REQUEST_TIMEOUT)
        # Clean up escape sequences before handing the body to the JSON parser.
        s = str(req.text).replace("\\xae", "")  # drop the literal text "\xae"
        s = s.replace("\\\\", "\\")  # collapse doubled backslashes
        s = s.replace("\\'", "'")  # \' is not a valid JSON escape
        details = json.loads(s)

        spec_row = self._spec_row(no, details["part"]["attributes"])
        self._write_csv("tech_specs", no, [spec_row], self.TECH_SPEC_COLUMNS)

        usage_rows = self._where_used_rows(no, details["vehicles"])
        self._write_csv("where_used", no, usage_rows, self.WHERE_USED_COLUMNS)

    def _spec_row(self, no, attributes):
        """Build one tech-spec row from the payload's list of attribute dicts."""
        # Attributes missing from the payload stay as empty strings.
        specs = dict.fromkeys(self.TECH_SPEC_COLUMNS[1:], "")
        for attribute in attributes:
            for key, value in attribute.items():
                column = self._ATTRIBUTE_COLUMNS.get(str(key))
                if column is not None:
                    specs[column] = value
        return [no] + [specs[column] for column in self.TECH_SPEC_COLUMNS[1:]]

    def _where_used_rows(self, no, vehicles):
        """Flatten the make -> models -> applications tree into CSV rows."""
        rows = []
        for vehicle in vehicles:
            make = vehicle["make"]
            for model_entry in vehicle["models"]:
                model = model_entry["model"]
                for application in model_entry["applications"]:
                    fitment = (str(application["startYear"]) + " - "
                               + str(application["endYear"]) + " "
                               + str(application["location"]).upper())
                    rows.append([no, make, model, fitment])
        return rows

    def _write_csv(self, prefix, no, rows, columns):
        """Write *rows* under *columns* to this part's CSV file for *prefix*."""
        # The path is built exactly as before (Windows-style separator) so that
        # anything reading these per-part files by name keeps working.
        path = "combine\\" + prefix + no + ".csv"
        # Create the output directory on first use; dirname is empty on
        # platforms where the backslash is not a path separator.
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        pd.DataFrame(rows, columns=list(columns)).to_csv(path, index=False)

class ThreadCrawl(threading.Thread):
    """Worker thread that takes part numbers from a queue and scrapes each one."""

    def __init__(self, queue):
        """Create a worker bound to *queue* with its own ``Spider`` instance."""
        # Use %(asctime)s so each record carries the time it was logged; a
        # timestamp formatted here would be frozen at construction time.
        # basicConfig does nothing once the root logger has handlers, so
        # calling it from every worker is harmless.
        logging.basicConfig(
            level=logging.INFO,
            format="[%(asctime)s][Spider]-----%(message)s------",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
        threading.Thread.__init__(self)
        self.queue = queue
        self.spider = Spider()

    def run(self):
        """Process queue items forever; the thread is expected to run as a daemon."""
        while True:
            item = self.queue.get()  # blocks until an item is available
            try:
                self.spider.getTECHandWhereUsed(item)
            except Exception:
                # A failure on one item must not kill the worker: if it did,
                # task_done() would never be called and Queue.join() in the
                # producer would block forever.
                logging.exception("failed to scrape %s", item)
            logging.info("now queue size is: %d", self.queue.qsize())
            self.queue.task_done()  # tell the queue this item is finished

class SpiderJob():
    """Run a pool of ``ThreadCrawl`` workers over a collection of part numbers."""

    def __init__(self, size, qs):
        """Store the worker count *size* and the items *qs* to process."""
        self.size = size
        self.qs = qs

    def work(self):
        """Start the workers, enqueue every item, and block until all are done.

        Raises:
            ValueError: if there are items to process but ``size`` is below 1
                (with no workers, waiting on the queue would never return).
        """
        items = list(self.qs)
        if items and self.size < 1:
            raise ValueError("size must be at least 1 when there are items to process")

        toSpiderQueue = Queue()
        for _ in range(self.size):
            worker = ThreadCrawl(toSpiderQueue)
            # Daemon threads do not keep the process alive once join() returns.
            # Thread.setDaemon() is deprecated in favour of this attribute.
            worker.daemon = True
            worker.start()
        for item in items:
            toSpiderQueue.put(item)
        toSpiderQueue.join()  # wait until every queued item has been marked done




# if __name__ == '__main__':
#     s = Spider()
#     s.getTECHandWhereUsed("171340R")






主函数:
from  tech_ebay import SpiderJob #从一个模块中导入类
import csv
import pandas as pd
if __name__ == '__main__':
    # Read the part numbers to scrape from the "EO" column of the workbook,
    # converting every cell to text.
    part_numbers = [str(value) for value in pd.read_excel("no.xlsx").EO]
    print(part_numbers)
    # Scrape all part numbers using 8 worker threads.
    SpiderJob(8, part_numbers).work()


将分开存储的数据合并的脚本:
import pandas as pd

# Merge the per-part CSV files into one tech-spec file and one where-used file.
df = pd.read_excel("no.xlsx")
print(df.head())

# Read every cell back as plain text.  With pandas' default type inference,
# numeric-looking part numbers lose leading zeros and literal values such as
# "N/A" become NaN (and are then written out as empty cells).
read_options = {"dtype": str, "keep_default_na": False}

dataframe_tech_specs = []
dataframe_where_used = []
for i in df.EO:
    part_no = str(i)
    dataframe_tech_specs.append(
        pd.read_csv("combine\\tech_specs" + part_no + ".csv", **read_options))
    dataframe_where_used.append(
        pd.read_csv("combine\\where_used" + part_no + ".csv", **read_options))

pd.concat(dataframe_tech_specs).to_csv("tech_specs_sum.csv", index=False)
pd.concat(dataframe_where_used).to_csv("where_used_sum.csv", index=False)



评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值