Organizing link_crawler and downloader
The code in the book is written for Python 2.7; I reworked it a bit so that it runs under Python 3.6.3.
downloader
import random
import time
import datetime
import urllib.request
import urllib.parse


class Downloader:
    def __init__(self, delay=5, user_agent='wswp',
                 proxies=None, num_retries=1, cache=None):
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.cache = cache

    def __call__(self, url):
        result = None
        if self.cache:
            try:
                result = self.cache[url]
            except KeyError:
                # this url is not yet in the cache
                pass
            else:
                if self.num_retries > 0 and \
                        500 <= result['code'] < 600:
                    # a server error was cached last time, so ignore it and retry
                    result = None
        if result is None:
            # result was not loaded from cache,
            # so still need to download
            self.throttle.wait(url)
            proxy = random.choice(self.proxies) if self.proxies else None
            headers = {'User-agent': self.user_agent}
            # every real download goes through self.download()
            result = self.download(url, headers, proxy, self.num_retries)
            if self.cache:
                # save the result to cache
                self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxy, num_retries, data=None):
        print('Downloading:', url)
        request = urllib.request.Request(url, data, headers or {})
        opener = urllib.request.build_opener()
        if proxy:
            proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib.request.ProxyHandler(proxy_params))
        try:
            urllib.request.install_opener(opener)
            response = opener.open(request)
            html = response.read()
            code = response.code
        except Exception as e:
            print('Download error:', str(e))
            html = b''  # keep the same type as response.read()
            if hasattr(e, 'code'):
                code = e.code
                if num_retries > 0 and 500 <= code < 600:
                    # retry 5XX HTTP errors
                    return self.download(url, headers, proxy, num_retries - 1, data=data)
            else:
                code = None
        return {'html': html, 'code': code}


class Throttle:
    # add a delay between downloads to the same domain
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        domain = urllib.parse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                # domain has been accessed recently,
                # so need to sleep
                time.sleep(sleep_secs)
        # update the last accessed time
        self.domains[domain] = datetime.datetime.now()
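Before wiring the Downloader into a crawler, it can be exercised on its own. Below is a minimal sketch (not from the book), assuming the class above lives in a module named downloader_leo as imported later; a plain dict stands in for a cache, since anything supporting __getitem__/__setitem__ works:

# throwaway in-memory cache for illustration
from downloader_leo import Downloader

cache = {}
D = Downloader(delay=2, user_agent='wswp', cache=cache)
html = D('http://example.webscraping.com')       # first call actually downloads
html_again = D('http://example.webscraping.com')  # second call is served from the dict
print(len(html), len(html_again))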
link_crawler
from downloader_leo import Downloader
import urllib.parse
import re
import urllib.robotparser


def link_crawler(seed_url, link_regex=None, delay=5, max_depth=2,
                 max_urls=-1, user_agent='wswp', proxies=None,
                 num_retries=1, scrape_callback=None,
                 cache=None):
    # crawl from the given seed url following links matched by link_regex
    # the queue of url's that still need to be crawled
    crawl_queue = [seed_url]
    # the url's that have been seen and at what depth
    seen = {seed_url: 0}
    # track how many urls have been downloaded
    num_urls = 0
    rp = get_robots(seed_url)
    D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies,
                   num_retries=num_retries, cache=cache)
    while crawl_queue:
        url = crawl_queue.pop()
        # the current depth is read at the start of every iteration to avoid
        # crawler traps; every link found here is stored in seen at depth+1,
        # so on the second pass all of them sit at depth 2
        depth = seen[url]
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            html = D(url)
            links = []
            if scrape_callback:
                # ScrapeCallback itself returns nothing, so this extend
                # leaves links empty every time
                links.extend(scrape_callback(url, html) or [])
                print('links from scrape_callback:', links)
            if depth != max_depth:
                # can still crawl further
                if link_regex:
                    # filter for links matching our regular expression
                    links.extend(link for link in get_links(html)
                                 if re.match(link_regex, link))
            '''
            This block implements the same functionality as the loop above,
            just less concisely; normalize() below is what turns each link
            into an absolute URL via urllib.parse.urljoin.

            crawl_queue = [seed_url]
            seen = set(crawl_queue)
            print('seed urls:', crawl_queue)
            while crawl_queue:
                url = crawl_queue.pop()
                html = download(url)
                print('downloaded html:', html)
                for link in getlinks(html):
                    print('returned link:', link)
                    if re.match(link_regex, link):
                        link = urllib.parse.urljoin(seed_url, link)
                        if link not in seen:
                            seen.add(link)
                            print('joined link:', link)
                            crawl_queue.append(link)
                print('crawl_queue:', crawl_queue)
                print('seen after this iteration:', seen)
            print('final seen:', seen)
            '''
            print('links', links)
            for link in links:
                link = normalize(seed_url, link)
                # check whether already crawled this link
                if link not in seen:
                    seen[link] = depth + 1
                    # check link is within same domain
                    if same_domain(seed_url, link):
                        # success! add this new link to queue
                        crawl_queue.append(link)
            print('seen links:', seen)
            # check whether have reached the download maximum
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print('Blocked by robots.txt:', url)


def normalize(seed_url, link):
    # strip the fragment and resolve the link against the seed url
    link = urllib.parse.urldefrag(link)
    link = urllib.parse.urljoin(seed_url, link.url)
    return link


def same_domain(url1, url2):
    # return True if both urls belong to the same domain
    return urllib.parse.urlparse(url1).netloc == urllib.parse.urlparse(url2).netloc


def get_robots(url):
    # initialize robots parser for this domain
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(urllib.parse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp


def get_links(html):
    # return a list of links from the html
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # the Downloader returns bytes, so decode before matching
    return webpage_regex.findall(html.decode())


def main():
    rp = get_robots('http://example.webscraping.com')
    print(rp.can_fetch('BadCrawler', 'http://example.webscraping.com'))


if __name__ == '__main__':
    main()
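The crawler can also be run directly, without a scrape callback or cache. A minimal sketch, reusing the module name link_crawler_leo and the link regex that appear in the later sections:

from link_crawler_leo import link_crawler

# crawl only the country index/view pages, one level deep, no caching
link_crawler('http://example.webscraping.com',
             link_regex='/places/default/(index|view)',
             delay=3, max_depth=1)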
scrape_callback
import csv
import re
import urllib.parse
import lxml.html
from link_crawler_leo import link_crawler


class ScrapeCallback:
    def __init__(self):
        # newline='' keeps the csv module from writing blank rows in Python 3 on Windows
        self.writer = csv.writer(open('countries.csv', 'w', newline=''))
        self.fields = ('area', 'population', 'iso', 'country', 'capital',
                       'continent', 'tld', 'currency_code', 'currency_name',
                       'phone', 'postal_code_format', 'postal_code_regex',
                       'languages', 'neighbours')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        # only country detail pages contain the places table
        if re.search('/view/', url):
            tree = lxml.html.fromstring(html)
            row = []
            for field in self.fields:
                row.append(tree.cssselect(
                    'table>tr#places_{}__row>td.w2p_fw'.format(field))[0].text_content())
            self.writer.writerow(row)


def main():
    link_crawler(seed_url='http://example.webscraping.com',
                 link_regex='/places/default/(index|view)',
                 scrape_callback=ScrapeCallback())


if __name__ == '__main__':
    main()
disk_cache
import pickle
from datetime import datetime, timedelta
import os
import zlib
import urllib.parse
import re
import shutil
from Crawler_learn.lec03.link_crawler_leo import link_crawler


class DiskCache():
    def __init__(self, cache_dir='cache', expires=timedelta(days=30), compress=True):
        '''
        :param cache_dir: the root level folder for the cache
        :param expires: timedelta of amount of time before a cache entry is considered expired
        :param compress: whether to compress data in the cache
        '''
        self.cache_dir = cache_dir
        self.expires = expires
        self.compress = compress

    def __getitem__(self, url):
        # load data from disk for this url
        path = self.url_to_path(url)
        if os.path.exists(path):
            with open(path, 'rb') as fp:
                data = fp.read()
            if self.compress:
                data = zlib.decompress(data)
            result, timestamp = pickle.loads(data)
            if self.has_expired(timestamp):
                raise KeyError(url + ' has expired')
            return result
        else:
            # url has not been cached
            raise KeyError(url + ' does not exist')

    def __setitem__(self, url, result):
        # save the data to disk for this url
        path = self.url_to_path(url)
        folder = os.path.dirname(path)
        if not os.path.exists(folder):
            os.makedirs(folder)
        data = pickle.dumps((result, datetime.utcnow()))
        if self.compress:
            data = zlib.compress(data)
        with open(path, 'wb') as fp:
            fp.write(data)

    def __delitem__(self, url):
        # remove this url's cache file and, if now empty, its parent folders
        path = self.url_to_path(url)
        try:
            os.remove(path)
            os.removedirs(os.path.dirname(path))
        except OSError:
            pass

    def url_to_path(self, url):
        # create a file system path for this url
        components = urllib.parse.urlsplit(url)
        path = components.path
        if not path:
            path = '/index.html'
        elif path.endswith('/'):
            path += 'index.html'
        elif path == '/places/default/index':
            path += '/index.html'
        filename = components.netloc + path + components.query
        # replace invalid characters
        filename = re.sub(r'[^/0-9a-zA-Z\-.,;_]', '_', filename)
        # restrict the maximum number of characters per path segment
        filename = '/'.join(seg[:255] for seg in filename.split('/'))
        return os.path.join(self.cache_dir, filename)

    def has_expired(self, timestamp):
        return datetime.utcnow() > timestamp + self.expires

    def clear(self):
        # remove all the cached values
        if os.path.exists(self.cache_dir):
            shutil.rmtree(self.cache_dir)


def main():
    link_crawler('http://example.webscraping.com/', '/places/default/(index|view)',
                 cache=DiskCache())


if __name__ == '__main__':
    main()
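DiskCache can also be sanity-checked on its own, since it behaves like a dictionary keyed by URL. A small sketch (the url and payload are placeholders, and cache_test is just a throwaway directory):

from datetime import timedelta

cache = DiskCache(cache_dir='cache_test', expires=timedelta(days=1))
url = 'http://example.webscraping.com/places/default/view/Australia-14'
cache[url] = {'html': b'<html>placeholder</html>', 'code': 200}
print(cache[url])  # the pickled, optionally compressed value round-trips
del cache[url]     # removes the file and any now-empty parent folders
cache.clear()      # wipes the whole cache directory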
MongoCache
import pickle
import zlib
from datetime import datetime, timedelta
from pymongo import MongoClient
from bson.binary import Binary


class MongoCache():
    def __init__(self, client=None, expires=timedelta(days=30)):
        """
        client: mongo database client
        expires: timedelta of amount of time before a cache
                 entry is considered expired
        """
        # if a client object is not passed,
        # then try connecting to mongodb at the default localhost port
        self.client = MongoClient('localhost', 27017) if client is None else client
        # create collection to store cached webpages,
        # which is the equivalent of a table in a relational database
        self.db = self.client.cache
        index_info = self.db.webpage.index_information()
        print(index_info)
        # once the TTL index exists, creating it again with a different expiry
        # raises an error (as happens when the expires value in main() is changed),
        # so drop any existing 'timestamp_1' index before re-creating it
        if 'timestamp_1' in index_info:
            self.db.webpage.drop_index('timestamp_1')
            print(self.db.webpage.index_information())
        self.db.webpage.create_index('timestamp',
                                     expireAfterSeconds=expires.total_seconds())
        print(self.db.webpage.index_information())

    def __contains__(self, url):
        try:
            self[url]
        except KeyError:
            return False
        else:
            return True

    def __getitem__(self, url):
        """Load the cached value for this url."""
        record = self.db.webpage.find_one({'_id': url})
        if record:
            return pickle.loads(zlib.decompress(record['result']))
        else:
            raise KeyError(url + ' does not exist')

    def __setitem__(self, url, result):
        """Save the value for this url."""
        # the field name must be 'timestamp' to match the TTL index above
        record = {'result': Binary(zlib.compress(pickle.dumps(result))),
                  'timestamp': datetime.utcnow()}
        # upsert inserts the record when the url has not been cached yet
        self.db.webpage.update_one({'_id': url}, {'$set': record}, upsert=True)

    def clear(self):
        # drop the whole collection of cached webpages
        self.db.webpage.drop()


def main():
    cache = MongoCache(expires=timedelta(seconds=35))
    result = {'html': 'hello world'}
    url = 'www.we99bscraping.com'
    cache[url] = result
    print(cache[url])


if __name__ == '__main__':
    main()
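One thing worth noting about the TTL index: MongoDB's background expiry task only runs roughly once a minute, so a record can outlive the configured expires value by a bit. A quick way to see this, assuming a local mongod on the default port (the url is a placeholder):

import time

cache = MongoCache(expires=timedelta(seconds=35))
url = 'http://example.webscraping.com/'
cache[url] = {'html': 'hello world'}
print(url in cache)   # True, via __contains__
time.sleep(120)       # give the once-a-minute TTL monitor time to run
print(url in cache)   # expected to be False once the expiry has been applied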