I've been teaching myself Python for a while, so here's a small project to test what I've learned:
Python version: 3.7
The script scrapes AQI data and finally inserts it into a MySQL database:
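The INSERT below targets a table called tb_js. The original post never shows its schema, so here is a minimal one-off setup sketch of what it might look like; the column names match the INSERT, but the types and the comments mapping columns to row cells are assumptions:

# Hypothetical setup script; the real tb_js schema is not shown
# in the original post, so every column type below is a guess.
import pymysql

con = pymysql.connect(host="xxx.xx.xx.xx", user="root",
                      password="xxx123", database="test", charset="utf8")
with con.cursor() as cur:
    cur.execute("""
        CREATE TABLE IF NOT EXISTS tb_js (
            zz      VARCHAR(100),  -- row cell i[0], e.g. the city name
            title   VARCHAR(100),  -- row cell i[1]
            intro   VARCHAR(100),  -- row cell i[2]
            url     VARCHAR(255),  -- row cell i[3]
            content VARCHAR(255)   -- row cell i[4]
        ) DEFAULT CHARSET = utf8
    """)
con.commit()
con.close()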
import time

import pymysql
import requests
from bs4 import BeautifulSoup
from requests import RequestException

# Connect to the database
con = pymysql.connect(host="xxx.xx.xx.xx", user="root", password="xxx123", database="test", charset="utf8")
cursor = con.cursor()

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
url = 'http://datacenter.mee.gov.cn/aqiweb2/'
def getAQIData(url):
    # Fetch the page; return its HTML text, or None on failure
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        print(url + ', visit error')
        return None
    except RequestException:
        print('request failed')
        return None
def edit_html(html):
    soup = BeautifulSoup(html, "html.parser")
    pushTime = soup.find_all(id="hour")[0]['value']            # publish hour (not used yet)
    nickname = soup.find(id="legend_01_table").find_all('tr')  # legend rows (not used yet)
    tableData = soup.find_all('table')[0]                      # first data table (not used yet)
    # Walk every table row, collect the cell texts, then save the row
    for tragg in soup.find_all('tr'):
        tdarray = []
        for tdagg in tragg.find_all('td'):
            tdval = get_tdagg(tdagg)
            tdarray.append(tdval)
        save_mysql_js(tdarray)

def get_tdagg(html):
    # Return the cell's text content as a plain string
    return str(html.get_text())
def save_mysql_js(i):
    # Only rows with more than four cells are full data rows
    if len(i) > 4:
        sql = "insert into tb_js(zz,title,intro,url,content) values(%s, %s, %s, %s, %s);"
        if i[0] == '北京市':
            # Debug output for one city's row
            print(i[0] + ';' + i[1] + ';' + i[2] + ';' + i[3] + ';' + i[4])
        # execute() returns the number of affected rows
        print(cursor.execute(sql, [str(i[0]), str(i[1]), str(i[2]), str(i[3]), str(i[4])]))
def time_Run():
    html = getAQIData(url)
    if html is not None:
        try:
            # Parse the page and run the INSERT statements
            edit_html(html)
            # Commit the transaction
            con.commit()
        except Exception:
            # Roll back in case of any error
            con.rollback()
    # Close the database connection
    # cursor.close()
    # con.close()

def timer(n):
    # Fetch and store the data every n seconds
    while True:
        print('==== scheduled run ====')
        time.sleep(n)
        time_Run()

# every 5 seconds
timer(5)
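One caveat: timer(5) loops forever, so the commented-out cursor.close() / con.close() never run and the connection is only released when the process dies. A minimal sketch of a cleaner entry point, reusing the con and cursor globals from above, could be:

# Sketch: stop on Ctrl-C and always release the connection.
# Reuses the con/cursor globals defined earlier; this guard is
# not part of the original post.
if __name__ == '__main__':
    try:
        timer(5)  # instead of calling timer(5) at module level
    except KeyboardInterrupt:
        pass
    finally:
        cursor.close()
        con.close()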