一:爬虫的常规方法
爬取网页表格的常用套路是按 table → tr(行)→ th/td(表头单元格/数据单元格)的层级逐层遍历。
'''
Created on Feb 28, 2017
@author: hcq908
'''
import csv
import os
# import re
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Script entry point: fetch the Wikipedia "Comparison of text editors" page,
# parse it with BeautifulSoup, and visit every <table> element in turn,
# printing a running table index for each one.
# NOTE(review): the indentation of this block appears to have been lost in this
# copy; the statements below presumably belong inside the `if` / `for` bodies --
# confirm and restore before running.
if __name__ == '__main__':
iCntTable = 0;  # running count of tables visited; incremented before use, so output is 1-based
html = urlopen("https://en.wikipedia.org/wiki/Comparison_of_text_editors")
# Alternative source page, currently disabled:
#html = urlopen("http://www.shfe.com.cn/bourseService/businessdata/summaryinquiry/index.html?paramid=trading_daily")
bsObj = BeautifulSoup(html, "html.parser")
oTables = bsObj.find_all("table")  # find_all collects every <table> on the page, not only the first
for table in oTables:
iCntTable =iCntTable + 1;
# Progress message; the text reads "processing table number N".
print('处理第%d个表格 \n'%iCntTable)
# Get the table's title.
#sTitleTag = table.find('caption');  # a table has only one caption; note that some tables have none
#print(sTitleTag)
#