Organizing link_crawler and downloader
The code in the book is written for Python 2.7; I reworked it a bit so that it runs under Python 3.6.3.
downloader
import random
import time
import datetime
import urllib.request
import urllib.parse


class Downloader:
    def __init__(self, delay=5, user_agent='wswp',
                 proxies=None, num_retries=1, cache=None):
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.cache = cache

    def __call__(self, url):
        result = None
        if self.cache:
            try:
                result = self.cache[url]
            except KeyError:
                # this url is not yet in the cache
                pass
            else:
                if self.num_retries > 0 and \
                        500 <= result['code'] < 600:
                    # a server error was cached last time, so ignore it and retry
                    result = None
        if result is None:
            # result was not loaded from cache,
            # so still need to download
            self.throttle.wait(url)
            proxy = random.choice(self.proxies) if self.proxies else None
            headers = {'User-agent': self.user_agent}
            # every real download goes through self.download()
            result = self.download(url, headers, proxy, self.num_retries)
            if self.cache:
                # save the result to cache
                self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxy, num_retries, data=None):
        print('Downloading:', url)
        request = urllib.request.Request(url, data, headers or {})
        opener = urllib.request.build_opener()
        if proxy:
            proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib.request.ProxyHandler(proxy_params))
        try:
            urllib.request.install_opener(opener)
            response = opener.open(request)
            html = response.read()
            code = response.code
        except Exception as e:
            print('Download error:', str(e))
            html = b''  # keep the same type as response.read()
            if hasattr(e, 'code'):
                code = e.code
                if num_retries > 0 and 500 <= code < 600:
                    # retry 5XX HTTP errors
                    return self.download(url, headers, proxy, num_retries - 1, data=data)
            else:
                code = None
        return {'html': html, 'code': code}


class Throttle:
    # add a delay between downloads to the same domain
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        domain = urllib.parse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                # domain has been accessed recently,
                # so need to sleep
                time.sleep(sleep_secs)
        # update the last accessed time
        self.domains[domain] = datetime.datetime.now()
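Before wiring the Downloader into a crawler, it can be exercised on its own. Below is a minimal sketch (not from the book), assuming the class above lives in a module named downloader_leo as imported later; a plain dict stands in for a cache, since anything supporting __getitem__/__setitem__ works:

# throwaway in-memory cache for illustration
from downloader_leo import Downloader

cache = {}
D = Downloader(delay=2, user_agent='wswp', cache=cache)
html = D('http://example.webscraping.com')       # first call actually downloads
html_again = D('http://example.webscraping.com')  # second call is served from the dict
print(len(html), len(html_again))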
link_crawler
from downloader_leo import Downloader
import urllib.parse
import re
import urllib.robotparser


def link_crawler(seed_url, link_regex=None, delay=5, max_depth=2,
                 max_urls=-1, user_agent='wswp', proxies=None,
                 num_retries=1, scrape_callback=None,
                 cache=None):
    # crawl from the given seed url following links matched by link_regex
    # the queue of url's that still need to be crawled
    crawl_queue = [seed_url]
    # the url's that have been seen and at what depth
    seen = {seed_url: 0}
    # track how many urls have been downloaded
    num_urls = 0
    rp = get_robots(seed_url)
    D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies,
                   num_retries=num_retries, cache=cache)
    while crawl_queue:
        url = crawl_queue.pop()
        # the current depth is read at the start of every iteration to avoid
        # crawler traps; every link found here is stored in seen at depth+1,
        # so on the second pass all of them sit at depth 2
        depth = seen[url]
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            html = D(url)
            links = []
            if scrape_callback:
                # ScrapeCallback itself returns nothing, so this extend
                # leaves links empty every time
                links.extend(scrape_callback(url, html) or [])
                print('links from scrape_callback:', links)
            if depth != max_depth:
                # can still crawl further
                if link_regex:
                    # filter for links matching our regular expression
                    links.extend(link for link in get_links(html)
                                 if re.match(link_regex, link))
            '''
            This block implements the same functionality as the loop above,
            just less concisely; normalize() below is what turns each link
            into an absolute URL via urllib.parse.urljoin.

            crawl_queue = [seed_url]
            seen = set(crawl_queue)
            print('seed urls:', crawl_queue)
            while crawl_queue:
                url = crawl_queue.pop()
                html = download(url)
                print('downloaded html:', html)
                for link in getlinks(html):
                    print('returned link:', link)
                    if re.match(link_regex, link):
                        link = urllib.parse.urljoin(seed_url, link)
                        if link not in seen:
                            seen.add(link)
                            print('joined link:', link)
                            crawl_queue.append(link)
                print('crawl_queue:', crawl_queue)
                print('seen after this iteration:', seen)
            print('final seen:', seen)
            '''
            print('links', links)
            for link in links:
                link = normalize(seed_url, link)
                # check whether already crawled this link
                if link not in seen:
                    seen[link] = depth + 1
                    # check link is within same domain
                    if same_domain(seed_url, link):
                        # success! add this new link to queue
                        crawl_queue.append(link)
            print('seen links:', seen)
            # check whether have reached the download maximum
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print('Blocked by robots.txt:', url)


def normalize(seed_url, link):
    # strip the fragment and resolve the link against the seed url
    link = urllib.parse.urldefrag(link)
    link = urllib.parse.urljoin(seed_url, link.url)
    return link


def same_domain(url1, url2):
    # return True if both urls belong to the same domain
    return urllib.parse.urlparse(url1).netloc == urllib.parse.urlparse(url2).netloc


def get_robots(url):
    # initialize robots parser for this domain
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(urllib.parse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp


def get_links(html):
    # return a list of links from the html
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # the Downloader returns bytes, so decode before matching
    return webpage_regex.findall(html.decode())


def main():
    rp = get_robots('http://example.webscraping.com')
    print(rp.can_fetch('BadCrawler', 'http://example.webscraping.com'))


if __name__ == '__main__':
    main()
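The crawler can also be run directly, without a scrape callback or cache. A minimal sketch, reusing the module name link_crawler_leo and the link regex that appear in the later sections:

from link_crawler_leo import link_crawler

# crawl only the country index/view pages, one level deep, no caching
link_crawler('http://example.webscraping.com',
             link_regex='/places/default/(index|view)',
             delay=3, max_depth=1)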
scrape_callback
import csv
import re
import urllib.parse
import lxml.html
from link_crawler_leo import link_crawler


class ScrapeCallback:
    def __init__(self):
        # newline='' keeps the csv module from writing blank rows in Python 3 on Windows
        self.writer = csv.writer(open('countries.csv', 'w', newline=''))
        self.fields = ('area', 'population', 'iso', 'country', 'capital',
                       'continent', 'tld', 'currency_code', 'currency_name',
                       'phone', 'postal_code_format', 'postal_code_regex',
                       'languages', 'neighbours')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        # only country detail pages contain the places table
        if re.search('/view/', url):
            tree = lxml.html.fromstring(html)
            row = []
            for field in self.fields:
                row.append(tree.cssselect(
                    'table>tr#places_{}__row>td.w2p_fw'.format(field))[0].text_content())
            self.writer.writerow(row)


def main():
    link_crawler(seed_url='http://example.webscraping.com',
                 link_regex='/places/default/(index|view)',
                 scrape_callback=ScrapeCallback())


if __name__ == '__main__':
    main()
disk_cache
import pickle
from datetime import datetime, timedelta
import os
import zlib
import urllib.parse
import re
import shutil
from Crawler_learn.lec03.link_crawler_leo import link_crawler


class DiskCache():
    def __init__(self, cache_dir='cache', expires=timedelta(days=30), compress=True):
        '''
        :param cache_dir: the root level folder for the cache
        :param expires: timedelta of amount of time before a cache entry is considered expired
        :param compress: whether to compress data in the cache
        '''
        self.cache_dir = cache_dir
        self.expires = expires
        self.compress = compress

    def __getitem__(self, url):
        # load data from disk for this url
        path = self.url_to_path(url)
        if os.path.exists(path):
            with open(path, 'rb') as fp:
                data = fp.read()
            if self.compress:
                data = zlib.decompress(data)
            result, timestamp = pickle.loads(data)
            if self.has_expired(timestamp):
                raise KeyError(url + ' has expired')
            return result
        else:
            # url has not been cached
            raise KeyError(url + ' does not exist')

    def __setitem__(self, url, result):
        # save the data to disk for this url
        path = self.url_to_path(url)
        folder = os.path.dirname(path)
        if not os.path.exists(folder):
            os.makedirs(folder)
        data = pickle.dumps((result, datetime.utcnow()))
        if self.compress:
            data = zlib.compress(data)
        with open(path, 'wb') as fp:
            fp.write(data)

    def __delitem__(self, url):
        # remove this url's cache file and, if now empty, its parent folders
        path = self.url_to_path(url)
        try:
            os.remove(path)
            os.removedirs(os.path.dirname(path))
        except OSError:
            pass

    def url_to_path(self, url):
        # create a file system path for this url
        components = urllib.parse.urlsplit(url)
        path = components.path
        if not path:
            path = '/index.html'
        elif path.endswith('/'):
            path += 'index.html'
        elif path == '/places/default/index':
            path += '/index.html'
        filename = components.netloc + path + components.query
        # replace invalid characters
        filename = re.sub(r'[^/0-9a-zA-Z\-.,;_]', '_', filename)
        # restrict the maximum number of characters per path segment
        filename = '/'.join(seg[:255] for seg in filename.split('/'))
        return os.path.join(self.cache_dir, filename)

    def has_expired(self, timestamp):
        return datetime.utcnow() > timestamp + self.expires

    def clear(self):
        # remove all the cached values
        if os.path.exists(self.cache_dir):
            shutil.rmtree(self.cache_dir)


def main():
    link_crawler('http://example.webscraping.com/', '/places/default/(index|view)',
                 cache=DiskCache())


if __name__ == '__main__':
    main()
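DiskCache can also be sanity-checked on its own, since it behaves like a dictionary keyed by URL. A small sketch (the url and payload are placeholders, and cache_test is just a throwaway directory):

from datetime import timedelta

cache = DiskCache(cache_dir='cache_test', expires=timedelta(days=1))
url = 'http://example.webscraping.com/places/default/view/Australia-14'
cache[url] = {'html': b'<html>placeholder</html>', 'code': 200}
print(cache[url])  # the pickled, optionally compressed value round-trips
del cache[url]     # removes the file and any now-empty parent folders
cache.clear()      # wipes the whole cache directory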
MongoCache
import pickle
import zlib
from datetime import datetime, timedelta
from pymongo import MongoClient
from bson.binary import Binary


class MongoCache():
    def __init__(self, client=None, expires=timedelta(days=30)):
        """
        client: mongo database client
        expires: timedelta of amount of time before a cache
                 entry is considered expired
        """
        # if a client object is not passed,
        # then try connecting to mongodb at the default localhost port
        self.client = MongoClient('localhost', 27017) if client is None else client
        # create collection to store cached webpages,
        # which is the equivalent of a table in a relational database
        self.db = self.client.cache
        index_info = self.db.webpage.index_information()
        print(index_info)
        # once the TTL index exists, creating it again with a different expiry
        # raises an error (as happens when the expires value in main() is changed),
        # so drop any existing 'timestamp_1' index before re-creating it
        if 'timestamp_1' in index_info:
            self.db.webpage.drop_index('timestamp_1')
            print(self.db.webpage.index_information())
        self.db.webpage.create_index('timestamp',
                                     expireAfterSeconds=expires.total_seconds())
        print(self.db.webpage.index_information())

    def __contains__(self, url):
        try:
            self[url]
        except KeyError:
            return False
        else:
            return True

    def __getitem__(self, url):
        """Load the cached value for this url."""
        record = self.db.webpage.find_one({'_id': url})
        if record:
            return pickle.loads(zlib.decompress(record['result']))
        else:
            raise KeyError(url + ' does not exist')

    def __setitem__(self, url, result):
        """Save the value for this url."""
        # the field name must be 'timestamp' to match the TTL index above
        record = {'result': Binary(zlib.compress(pickle.dumps(result))),
                  'timestamp': datetime.utcnow()}
        # upsert inserts the record when the url has not been cached yet
        self.db.webpage.update_one({'_id': url}, {'$set': record}, upsert=True)

    def clear(self):
        # drop the whole collection of cached webpages
        self.db.webpage.drop()


def main():
    cache = MongoCache(expires=timedelta(seconds=35))
    result = {'html': 'hello world'}
    url = 'www.we99bscraping.com'
    cache[url] = result
    print(cache[url])


if __name__ == '__main__':
    main()
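One thing worth noting about the TTL index: MongoDB's background expiry task only runs roughly once a minute, so a record can outlive the configured expires value by a bit. A quick way to see this, assuming a local mongod on the default port (the url is a placeholder):

import time

cache = MongoCache(expires=timedelta(seconds=35))
url = 'http://example.webscraping.com/'
cache[url] = {'html': 'hello world'}
print(url in cache)   # True, via __contains__
time.sleep(120)       # give the once-a-minute TTL monitor time to run
print(url in cache)   # expected to be False once the expiry has been applied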