Python网络爬虫

# encoding:utf8

from urllib import request
from bs4 import BeautifulSoup
import pymysql
import time


# Downloader
def htmlDownload(url, timeout=30):
    """Fetch *url* and return the raw response body.

    The request is sent with a desktop-browser ``User-Agent`` header.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait on blocking network operations, so that a
            stalled server cannot hang the caller indefinitely.

    Returns:
        The response body as ``bytes``, undecoded.

    Raises:
        Any exception raised by ``urllib.request.urlopen`` or by reading the
        response propagates to the caller.
    """
    req = request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2723.3 Safari/537.36')
    # The context manager closes the response even when read() raises.
    with request.urlopen(req, timeout=timeout) as res:
        return res.read()

# Parser
def htmlParser(html):
    """Extract the update time and the data rows from the downloaded page.

    Args:
        html: Page markup; it is handed to BeautifulSoup with
            ``from_encoding="utf8"``.

    Returns:
        A tuple ``(updatatime, data)``. ``updatatime`` is the last 15
        characters of the text of the first ``<div class="remark">``.
        ``data`` is a list with one entry for each of table rows 1-4 (row 0
        is skipped); each entry is a list of six values taken from the row's
        first six ``<td>`` cells. Cell 2 contributes the last four characters
        of its ``<img>`` ``alt`` attribute; every other cell contributes its
        text.

    Raises:
        ValueError: If the page does not contain the expected nodes, rows,
            cells or image ``alt`` attribute.
    """
    first_row, last_row = 1, 4   # table rows to read; row 0 is skipped
    column_count = 6             # cells read from each row
    image_column = 2             # cell whose value comes from the <img> alt text

    soup = BeautifulSoup(html, 'html.parser', from_encoding="utf8")

    timenode = soup.find('div', class_='remark')
    if timenode is None:
        raise ValueError("update-time node <div class='remark'> not found")
    # Keep only the trailing 15 characters, e.g. '2018年03月07日 19时'.
    updatatime = timenode.get_text()[-15:]

    dataNode = soup.find("table")
    if dataNode is None:
        raise ValueError("data <table> not found")
    tr_node = dataNode.find_all("tr")
    if len(tr_node) <= last_row:
        raise ValueError(
            "expected at least %d table rows, found %d" % (last_row + 1, len(tr_node))
        )

    data = []
    for row in tr_node[first_row:last_row + 1]:
        td_node = row.find_all("td")
        if len(td_node) < column_count:
            raise ValueError(
                "expected at least %d cells in a row, found %d" % (column_count, len(td_node))
            )
        record = []
        for index, cell in enumerate(td_node[:column_count]):
            if index == image_column:
                img = cell.find('img')
                if img is None or not img.has_attr('alt'):
                    raise ValueError("<img> with an alt attribute not found in cell %d" % index)
                record.append(img['alt'][-4:])
            else:
                record.append(cell.get_text())
        data.append(record)
    return updatatime, data

# Write the scraped data to the database
def dataWriter(updatatime, data):
    """Insert each scraped row into the ``pdsenvironment`` table.

    Rows are inserted and committed one at a time, so a failure on one row
    does not prevent the remaining rows from being stored. The outcome of
    every row is printed.

    Args:
        updatatime: Update-time string stored in the ``updatatime`` column.
        data: Iterable of rows; elements 0-5 of each row are stored in the
            ``AreaName``, ``AQI``, ``PollutionGrade``, ``PM2_5``, ``PM10``
            and ``FirstItem`` columns respectively.
    """
    # Placeholders are filled in by the driver, which escapes the values;
    # scraped text must never be interpolated into the SQL string itself.
    insert_sql = (
        "insert into pdsenvironment"
        "(updatatime, AreaName, AQI, PollutionGrade, PM2_5, PM10, FirstItem) "
        "values(%s, %s, %s, %s, %s, %s, %s)"
    )

    # Create the connection object
    connection = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='root', db='environment', charset='utf8')
    try:
        with connection.cursor() as cursor:
            for d in data:
                try:
                    cursor.execute(
                        insert_sql,
                        (updatatime, d[0], d[1], d[2], d[3], d[4], d[5]),
                    )
                    connection.commit()  # persist the inserted row
                except pymysql.IntegrityError:
                    # Constraint violation: reported as a row already stored.
                    connection.rollback()
                    print("%s   %s :该数据在数据库中已存在!"% (updatatime, d[0]))
                except Exception as e:
                    # Any other failure is reported as such instead of being
                    # mislabelled as a duplicate; keep going with the next row.
                    connection.rollback()
                    print("%s   %s :写入数据库失败: %s" % (updatatime, d, e))
                else:
                    print("%s   %s :数据已经存入数据库!"% (updatatime, d[0]))
    finally:
        # Always release the connection, even if an error escapes the loop.
        connection.close()
    

def getEnvironmentData():
    """Download, parse and store one snapshot, retrying until it succeeds.

    On any exception the error is printed and the whole
    download/parse/store sequence is attempted again after 60 seconds.
    Retrying uses a loop rather than recursion so that a long outage cannot
    exhaust the call stack.
    """
    url = "http://www.86kongqi.com/city/pingdingshan.html"
    while True:
        try:
            html = htmlDownload(url)
            updatatime, data = htmlParser(html)
            dataWriter(updatatime, data)
            return
        except Exception as e:
            print("出现异常!一分钟后重试……")
            print("Exception: "+str(e))
            time.sleep(60)
        

# Poll forever: collect one snapshot, then sleep for an hour before the next.
POLL_INTERVAL_SECONDS = 3600
while True:
    print("开始工作!")
    getEnvironmentData()
    print("休息当中……")
    print("\n")
    time.sleep(POLL_INTERVAL_SECONDS)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值