python 爬取中国天气网（济南）数据

最新推荐文章于 2024-08-10 17:53:20 发布

码农-Python高工

最新推荐文章于 2024-08-10 17:53:20 发布

阅读量723

点赞数

分类专栏： python基础

本文链接：https://blog.csdn.net/weixin_44541001/article/details/104027333

版权

python基础专栏收录该内容

32 篇文章 0 订阅

订阅专栏

要抓取的是下图所示位置的数据。在页面上右击并选择“查看网页源代码”，可以看到这些数据就嵌在页面源码里，用正则表达式提取出来即可：

 dd = re.findall(r'{"od":{"od0".*', html)[0][:-1]，具体看代码

在这里插入图片描述

# 济南天气
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import datetime
import json
import time
import random

import requests
import re
from lxml import etree
import pymysql
from DBUtils.PooledDB import PooledDB


class Weather_china:
    """Crawler that stores hourly observations from weather.com.cn in MySQL.

    Each page embeds a JSON object (``{"od": {...}}``) holding the most
    recent hourly readings.  ``workOn`` visits a fixed list of Jinan-area
    pages, ``parseUrl`` extracts the readings of one page and decides per
    hour whether to update an existing row or queue a new one, and
    ``toMysql`` bulk-inserts the queued rows into ``weather_china``.
    """

    # Pool of User-Agent headers; one is picked at random for every request.
    headers = [{"User-Agent": "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)"},
               {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)"},
               {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"},
               ]

    # Seconds to wait for the site before a request is abandoned; without a
    # timeout a stalled connection would block the whole crawl indefinitely.
    request_timeout = 10

    def __init__(self):
        """Open a pooled MySQL connection and a cursor shared by all methods."""
        # NOTE(review): host and credentials are hard-coded; consider moving
        # them to configuration.
        self.host = '127.0.0.1'
        self.pool = PooledDB(creator=pymysql, maxcached=5, maxshared=5, host=self.host, user='root',
                             passwd='123', db="test01", port=3306,
                             charset="utf8")
        self.conn = self.pool.connection()
        self.cursor = self.conn.cursor()

    @staticmethod
    def _extract_payload(html):
        """Return the decoded ``{"od": ...}`` object embedded in *html*.

        The object is matched from ``{"od":{"od0"`` to the end of its line and
        the final character of the match is dropped before decoding
        (presumably the trailing ``;`` of the JavaScript statement).

        Returns ``None`` when the page does not contain the object.
        Raises whatever ``json.loads`` raises if the matched text is not JSON.
        """
        found = re.findall(r'{"od":{"od0".*', html)
        if not found:
            return None
        return json.loads(found[0][:-1])

    @staticmethod
    def _observation_time(stamp, hour_text):
        """Build the ``now_time`` key for one hourly reading.

        *stamp* is the publication time as ``YYYYMMDDHH`` and *hour_text* is
        the reading's hour of day (``od21``).  A reading whose hour is later
        than the publication hour belongs to the previous calendar day; real
        date arithmetic is used so that month and year boundaries roll over
        correctly (subtracting 1 from the day number would yield day 0 on the
        first of a month).

        The result keeps the historical ``YYYY-MM-D HH`` layout (day without
        zero padding, hour exactly as supplied) so that keys already stored
        in the table still match.
        """
        day = datetime.date(int(stamp[0:4]), int(stamp[4:6]), int(stamp[6:8]))
        if int(hour_text) > int(stamp[8:10]):
            day -= datetime.timedelta(days=1)
        return '%04d-%02d-%d %s' % (day.year, day.month, day.day, hour_text)

    def parseUrl(self, url):
        """Fetch one city page and persist its hourly readings.

        For every reading the table is checked for a row with the same
        ``now_time`` and ``city``: exactly one match gets its ``air_quality``
        refreshed, no match queues the reading for insertion, and the queued
        rows are handed to ``toMysql`` at the end.  A page without embedded
        observation data is reported and skipped.
        """
        header1 = random.choice(self.headers)
        res = requests.get(url, headers=header1, verify=False, timeout=self.request_timeout)
        res.encoding = "utf-8"
        html = res.text

        datas = self._extract_payload(html)
        if datas is None:
            # Skip this city instead of aborting the remaining ones.
            print("no observation data found in %s" % url)
            return

        # City name: all text nodes of the breadcrumb bar, stripped and joined.
        parse = etree.HTML(html)
        now_city = parse.xpath('//div[@class="crumbs fl"]//text()')
        city = ''.join(text.strip() for text in now_city)
        print(city)

        # od0 is the publication timestamp; its last four characters are
        # dropped, leaving YYYYMMDDHH.
        stamp = datas["od"]["od0"][:-4]

        # Field meanings, as recorded by the original author:
        #   od21  hour of the reading      -> now_time
        #   od22  temperature              -> temperature
        #   od23  wind direction           -> wind_direction
        #   od25  wind power               -> wind_power
        #   od26  precipitation            -> precipitation
        #   od27  relative humidity        -> relative_humidity
        #   od28  air quality              -> air_quality
        # NOTE(review): the author's sample record shows od23 as a number and
        # od24 as the textual wind name; confirm which one is wanted.
        # The author also notes that air quality is missing for the current
        # day, which is presumably why existing rows get it updated later.
        L = []
        # The last entry of od2 is skipped, as in the original code.
        for data in datas['od']['od2'][:-1]:
            now_time = self._observation_time(stamp, data['od21'])
            temperature = data['od22']
            wind_direction = data['od23']
            wind_power = data['od25']
            precipitation = data['od26']
            relative_humidity = data['od27']
            air_quality = data['od28']

            # Values are passed as parameters so the driver escapes them;
            # the city text comes straight from the scraped page.
            number = self.cursor.execute(
                "select * from weather_china where now_time=%s and city=%s",
                (now_time, city))
            if number == 1:
                try:
                    self.cursor.execute(
                        "update weather_china set air_quality=%s where now_time=%s and city=%s",
                        (air_quality, now_time, city))
                    self.conn.commit()
                    print("更新成功")
                except Exception as e:
                    self.conn.rollback()
                    print("更新失败")
                    print(e)
            elif number == 0:
                L.append((now_time, city, temperature, wind_direction, wind_power, precipitation,
                          relative_humidity, air_quality))

        self.toMysql(L)

    def toMysql(self, L):
        """Bulk-insert the queued rows in *L* into ``weather_china``.

        *L* is a list of 8-tuples ``(now_time, city, temperature,
        wind_direction, wind_power, precipitation, relative_humidity,
        air_quality)``.  On success the list is emptied in place; on failure
        the transaction is rolled back, the error is printed and *L* is left
        untouched.  An empty list is a no-op.
        """
        if not L:
            return
        sql = ("INSERT INTO weather_china(now_time, city, temperature, wind_direction, "
               "wind_power, precipitation, relative_humidity, air_quality) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
        try:
            self.cursor.executemany(sql, L)
            self.conn.commit()
            print("写入成功")
            del L[:]
        except Exception as e:
            self.conn.rollback()
            print(e)

    def workOn(self):
        """Crawl every configured Jinan-area page once, pausing briefly between requests."""
        urls = ['http://www.weather.com.cn/weather1d/101120101.shtml#around2',  # Jinan urban area
                'http://www.weather.com.cn/weather1d/101121601.shtml#around2',  # Laiwu
                'http://www.weather.com.cn/weather1d/101120102.shtml#around2',  # Changqing
                'http://www.weather.com.cn/weather1d/101120107.shtml#input',  # Lixia
                'http://www.weather.com.cn/weather1d/101120111.shtml#input',  # Licheng
                'http://www.weather.com.cn/weather1d/101120109.shtml#input',  # Huaiyin
                'http://www.weather.com.cn/weather1d/101120110.shtml#input',  # Tianqiao
                'http://www.weather.com.cn/weather1d/101120108.shtml#input',  # Shizhong
                'http://www.weather.com.cn/weather1d/101120104.shtml#input',  # Zhangqiu
                'http://www.weather.com.cn/weather1d/101120106.shtml#input',  # Jiyang
                'http://www.weather.com.cn/weather1d/101121603.shtml#input',  # Gangcheng
                'http://www.weather.com.cn/weather1d/101120103.shtml#input',  # Shanghe
                'http://www.weather.com.cn/weather1d/101120105.shtml#input']  # Pingyin

        for url in urls:
            time.sleep(0.1)
            self.parseUrl(url)


if __name__ == '__main__':
    # Script entry point: build the crawler and run one pass over all cities.
    Weather_china().workOn()