1.天气预报数据爬取

首先需要通过接口返回的数据得到各个城市的气象站码,并保存csv文件。实现代码如下:

#获取城市气象站信息(编码,名称,经纬度,该函数只需要执行一次即可)
import csv
import json
from datetime import time
import requests
def main():
    """Download the CMA station list (code, name, lat/lon) and save it to
    data/city_code.csv. Needs to run only once.

    Raises: requests.RequestException on network failure; KeyError if the
    API payload shape changes.
    """
    url = "https://weather.cma.cn/api/map/weather/1?t=1679576188443"
    res = requests.get(url, timeout=30)  # timeout so a dead endpoint can't hang forever
    res.encoding = 'utf-8'
    data = json.loads(res.text)
    # "with" guarantees the CSV is flushed and closed even on error
    # (the original left the handle open, risking truncated output).
    with open("data/city_code.csv", mode="w", newline='', encoding="utf-8") as code_file:
        writer = csv.writer(code_file)
        for city in data["data"]["city"]:
            # Row layout per the API: [0]=station code, [1]=name,
            # [4]=latitude, [5]=longitude.
            writer.writerow([city[0], city[1], city[4], city[5]])
if __name__ == '__main__':
    main()

然后通过读取保存的csv文件的内容,通过气象站码获取返回的html文件,使用BeautifulSoup来进行解析,并将得到的结果保存csv文件,随后读取csv文件,将结果存储进mysql中,使用定时调度来进行定时执行。完整代码如下:

# coding=gbk
# 获取小时数据以及实时数据
import csv
from datetime import datetime
import json
import re
import time
import pandas as pd
import mysql.connector
import  requests
from apscheduler.schedulers.blocking import BlockingScheduler
from  bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
def getWeekWeather():
    """Scrape the 7-day forecast page (with its 3-hourly tables) for every
    station and write one CSV row per (day, hour-slot) to data/week_weather.csv.
    """
    # "with" closes/flushes the CSV even if the scrape dies mid-run
    # (the original never closed the handle).
    with open("data/week_weather.csv", mode="w", newline='', encoding="utf-8") as out_file:
        csvw_hour = csv.writer(out_file)
        # Read from the same path the collector writes (data/city_code.csv);
        # the original read "city_code.csv" from the working directory.
        codes = pd.read_csv("data/city_code.csv", header=None,
                            names=['code', 'city', 'latitude', 'longitude'])
        for code in codes['code']:
            # str() in case pandas parsed numeric station codes as ints.
            hourWeatherUrl = "https://weather.cma.cn/web/weather/" + str(code) + ".html"
            print(hourWeatherUrl)
            session = requests.Session()
            session.mount('https://', HTTPAdapter(max_retries=4))
            try:
                # Fetch through the session so the retry adapter actually applies
                # (the original called requests.get directly, and then clobbered
                # the session variable with the soup object).
                response = session.get(hourWeatherUrl, timeout=30)
                soup = BeautifulSoup(response.content.decode("utf-8"), 'html5lib')
                breadcrumb = soup.find(class_='breadcrumb mybreadcrumb')
                cityList = breadcrumb.find_all("li")
                city = cityList[2].a.text  # province name -- NOTE(review): city/pro labels look swapped vs saveWeekMysql's columns; kept as-is
                pro = cityList[3].text  # city name
                # One "day" panel per forecast day; today's panel carries a
                # different class ("actived"), so collect it first.
                weatherInfo = []
                dayActive = soup.find(class_='pull-left day actived')
                weatherInfo.append(dayActive)
                weatherInfo += soup.find_all("div", class_="pull-left day")
                try:
                    hourTable = soup.find_all("table", class_="hour-table")
                    o = 0
                    for table in hourTable:
                        # Flatten the table so hourData[a + 8*k] is metric k for
                        # slot a (8 slots per metric row).
                        hourData = []
                        for tr in table.find_all("tr"):
                            hourData.extend(tr.find_all("td")[1:])
                        info = weatherInfo[o].find_all("div", class_="day-item")
                        o += 1
                        string = info[0].text.replace('\n', '').replace(' ', '')
                        week = re.sub(u'([^\u4e00-\u9fa5])', '', string)  # weekday (CJK chars only)
                        date = re.sub(u"([^\u0030-\u0039\/])", "", string)  # date (digits and '/')
                        src1 = info[1].img.get('src').replace('\n', '').strip()  # daytime weather icon URL
                        weather1 = info[2].text.replace('\n', '').strip()  # daytime conditions
                        wind1 = info[3].text.replace('\n', '').strip()  # daytime wind direction
                        windPower1 = info[4].text.replace('\n', '').strip()  # daytime wind force
                        high = int(info[5].find(class_="high").text.replace('\n', '').strip().split("℃")[0])  # daily high (°C)
                        low = int(info[5].find(class_="low").text.replace('\n', '').strip().split("℃")[0])  # daily low (°C)
                        src2 = info[6].img.get('src').replace('\n', '').strip()  # night weather icon URL
                        weather2 = info[7].text.replace('\n', '').strip()  # night conditions
                        wind2 = info[8].text.replace('\n', '').strip()  # night wind direction
                        windPower2 = info[9].text.replace('\n', '').strip()  # night wind force
                        for a in range(0, 8):
                            try:
                                slot_time = hourData[a].text  # hour label (renamed: original shadowed the time module)
                                src3 = hourData[a + 8].img.get('src')  # slot weather icon URL
                                temperature = float(hourData[a + 16].text.split("℃")[0])  # slot temperature
                                rainfall_text = hourData[a + 24].text  # slot precipitation
                                if rainfall_text == '无降水':
                                    rainfall = 0
                                else:
                                    rainfall = float(rainfall_text.split("mm")[0])
                                windSpeed = float(hourData[a + 32].text.split("m/s")[0])  # slot wind speed
                                windDirection = hourData[a + 40].text  # slot wind direction
                                pressure = float(hourData[a + 48].text.split("hPa")[0])  # slot pressure
                                humidity = float(hourData[a + 56].text.split("%")[0])  # slot humidity
                                cloudAmount = float(hourData[a + 64].text.split("%")[0])  # slot cloud cover
                            except Exception:
                                # Missing/partial slot: fall back to the 9999 sentinels.
                                slot_time = '9999'
                                src3 = '9999'
                                temperature = 9999
                                rainfall = 9999
                                windSpeed = 9999
                                windDirection = '9999'
                                pressure = 9999
                                humidity = 9999
                                cloudAmount = 9999
                            csvw_hour.writerow(
                                [code, city, pro, date, week, src1, weather1, wind1, windPower1,
                                 high, low, src2, weather2, wind2, windPower2, slot_time, src3,
                                 temperature, rainfall, windSpeed, windDirection, pressure,
                                 humidity, cloudAmount])
                except Exception as e:
                    print("---", e)
            except Exception as e:
                print('错误:', e)
def getNowWeather():
    """Fetch current conditions (plus any active alarms) for every station from
    the CMA "now" JSON API and write one row per station to data/now_weather.csv.
    """
    # "with" closes/flushes the CSV even on error (original never closed it).
    with open("data/now_weather.csv", mode="w", newline='', encoding="utf-8") as out_file:
        csvw_now = csv.writer(out_file)
        # Read from the same path the collector writes (data/city_code.csv);
        # the original read "city_code.csv" from the working directory.
        codes = pd.read_csv("data/city_code.csv", header=None,
                            names=['code', 'city', 'latitude', 'longitude'])
        for code in codes['code']:
            nowWeatherUrl = "https://weather.cma.cn/api/now/" + str(code)
            session = requests.Session()
            session.mount('https://', HTTPAdapter(max_retries=4))
            try:
                # Fetch through the session so the retry adapter actually applies
                # (the original called requests.get directly, bypassing it).
                response = session.get(nowWeatherUrl, timeout=30)
                data = json.loads(response.content.decode("utf-8"))
                try:
                    code = data["data"]['location']['id']
                    location_path = data["data"]['location']['path'].replace(" ", "").split(',')
                    data_now = data["data"]["now"]
                    alarm = data["data"]["alarm"]
                    province = location_path[1]
                    city = location_path[2]
                    # Relies on the API returning these keys in exactly this
                    # order (dict preserves insertion order) -- TODO confirm.
                    precipitation, temperature, pressure, humidity, windDirection, \
                        windDirectionDegree, windSpeed, windScale = data_now.values()
                    if alarm:
                        title = alarm[0]['title']
                        signaltype = alarm[0]['signaltype']
                        signallevel = alarm[0]['signallevel']
                        effective = alarm[0]['effective']
                    else:
                        # '9999' sentinels mean "no active alarm".
                        title = '9999'
                        signaltype = '9999'
                        signallevel = '9999'
                        effective = '9999'
                    csvw_now.writerow([code, province, city, precipitation, temperature, pressure,
                                       humidity, windDirection, windDirectionDegree, windSpeed,
                                       windScale, title, signaltype, signallevel, effective])
                except Exception as e:
                    print("error", e)
            except Exception as e:
                print("错误:", e)
def saveWeekMysql():
    """Replace the contents of the week_weather table with the rows currently
    in data/week_weather.csv (full refresh: DELETE, then row-by-row INSERT).
    """
    # Load the scraped CSV into a DataFrame; column order must match the
    # INSERT column list below.
    hour_df = pd.read_csv('data/week_weather.csv', header=None,
                     names=['code', 'pro', 'city', 'date', 'week', 'src1', 'weather1', 'wind1',
                            'windPower1', 'high', 'low', 'src2', 'weather2', 'wind2', 'windPower2',
                            'time', 'src3', 'temperature', 'rainfall', 'windSpeed', 'windDirection',
                            'pressure', 'humidity', 'cloudAmount'])
    try:
        cnx = mysql.connector.connect(user='root', password='123456', host='localhost', database='weather')
        cursor = cnx.cursor()
        # Full refresh: wipe the table before re-inserting the latest scrape.
        cursor.execute("DELETE FROM week_weather")
        add_row_hour = ("INSERT INTO week_weather "
                   "(code,pro, city, date, week, src1, weather1, wind1, windPower1, high, low, src2, weather2, wind2,windPower2, time, src3, temperature, rainfall, windSpeed, windDirection, pressure, humidity,cloudAmount) "
                   "VALUES (%s, %s, %s, %s,%s, %s, %s,%s, %s, %s,%s, %s, %s,%s, %s, %s,%s, %s, %s,%s, %s, %s,%s, %s)")
        for _, row in hour_df.iterrows():
            # DataFrame column order matches the INSERT column list above.
            cursor.execute(add_row_hour, tuple(row[c] for c in hour_df.columns))
        cnx.commit()
        cursor.close()
        cnx.close()
    except mysql.connector.Error as err:
        print("MySQL错误:{}".format(err))
    except IOError as err:
        print("I/O错误:{}".format(err))
    else:
        # Report success only when no exception occurred (the original printed
        # this unconditionally, even after a caught failure).
        print("数据已成功导入MySQL!")
def saveNowMysql():
    """Replace the contents of the now_weather table with the rows currently
    in data/now_weather.csv (full refresh: DELETE, then row-by-row INSERT).
    """
    now_df = pd.read_csv('data/now_weather.csv', header=None,
                         names=['code', 'province', 'city', 'precipitation', 'temperature', 'pressure',
                                'humidity', 'windDirection', 'windDirectionDegree', 'windSpeed',
                                'windScale', 'title', 'signaltype', 'signallevel', 'effective'])
    try:
        cnx = mysql.connector.connect(user='root', password='123456', host='localhost', database='weather')
        cursor = cnx.cursor()
        # Full refresh: wipe the table before re-inserting the latest scrape.
        cursor.execute("DELETE FROM now_weather")
        add_row_now = ("INSERT INTO now_weather "
                   "(code,province,city,precipitation, temperature, pressure, humidity,windDirection,windDirectionDegree, windSpeed, windScale,title,signaltype,signallevel,effective) "
                   "VALUES (%s, %s, %s, %s,%s, %s, %s,%s, %s, %s,%s, %s, %s,%s, %s)")
        for _, row in now_df.iterrows():
            # DataFrame column order matches the INSERT column list above.
            cursor.execute(add_row_now, tuple(row[c] for c in now_df.columns))
        cnx.commit()
        cursor.close()
        cnx.close()
    except mysql.connector.Error as err:
        print("MySQL错误:{}".format(err))
    except IOError as err:
        print("I/O错误:{}".format(err))
    else:
        # Report success only when no exception occurred (the original printed
        # this unconditionally, even after a caught failure).
        print("数据已成功导入MySQL!")
# def main():
#      getWeekWeather()
#      saveWeekMysql()
#      getNowWeather()
#      saveNowMysql()
# if __name__ == '__main__':
#     s = time.time()
#     main()
#     e = time.time()
#     print('总用时:', (e - s) / 3600)
def now():
    """Scrape current-conditions data, then persist it to MySQL."""
    for step in (getNowWeather, saveNowMysql):
        step()
def hour():
    """Scrape the 7-day/hourly forecast data, then persist it to MySQL."""
    for step in (getWeekWeather, saveWeekMysql):
        step()
def func():
    """Wire both scrape-and-persist jobs into a scheduler and run it.

    BlockingScheduler.start() blocks the calling thread, so this function
    never returns during normal operation.
    """
    # Create the blocking scheduler.
    scheduler = BlockingScheduler()
    # Current-conditions scrape + DB save every 15 minutes.
    scheduler.add_job(now, 'interval', minutes=15, id='now')
    # 7-day/hourly forecast scrape + DB save every 2 hours.
    scheduler.add_job(hour, 'interval', hours=2, id='hour')
    scheduler.start()
# Start scheduling immediately on import/run.
func()

对于一周每小时的数据,还可以使用selenium来实现,但是由于selenium爬取太慢,即使使用多线程进行爬取,时间依旧很长,因此,只适合采集一次性的数据。完整代码如下:

import csv
import datetime
import time
from threading import Thread

# BlockingScheduler lives in apscheduler.schedulers.blocking; the original
# imported it from apscheduler.schedulers.background, which raises
# ImportError (that module only provides BackgroundScheduler).
from apscheduler.schedulers.blocking import BlockingScheduler
from selenium import webdriver
from selenium.webdriver.common.by import By

scheduler = BlockingScheduler(timezone='Asia/Shanghai')
# Shared CSV output: one file/writer used by all scraper threads.
f = open(f"全国天气信息.csv", mode="w", newline='', encoding="utf-8")
csvw = csv.writer(f)
def getData(start,end):
    """Scrape the 7-day forecast for provinces [start, end) by driving the CMA
    site's city-picker dropdowns with Selenium; one CSV row per forecast day.

    NOTE(review): every worker thread writes through the shared module-level
    csv writer without a lock -- rows may interleave; confirm this is intended.
    """
    chrome_driver = r'G:\pythonProject\venv\Scripts\chromedriver.exe'
    # Hide the "Chrome is being controlled by automated test software" banner.
    options = webdriver.ChromeOptions()
    options.add_experimental_option('useAutomationExtension', False)
    options.add_experimental_option("excludeSwitches", ['enable-automation'])
    browser = webdriver.Chrome(executable_path=chrome_driver, options=options)
    # browser.implicitly_wait(1)
    browser.set_page_load_timeout(60)  # page-load timeout (seconds)
    browser.set_script_timeout(60)  # async-script timeout (seconds)
    try:
        browser.get('https://weather.cma.cn/web/weather/58367.html')
        time.sleep(1)
    except Exception as e:
        # Page-load timeout: reload once and continue.
        print("加载超时-----")
        browser.refresh();
    # Walk this worker's share of the province dropdown.
    for p in range(start, end):
        province_id = start + 1  # NOTE(review): constant for the whole worker; likely meant to be p -- confirm
        browser.find_element(By.XPATH, f'//*[@id="cityPosition"]/div[3]/button').click()
        browser.find_element(By.XPATH, f'//*[@id="cityPosition"]/div[3]/ul/li[{p}]').click()
        time.sleep(1)
        count = 0
        # Try up to 149 city entries; break when the list is exhausted.
        for c in range(1, 150):
            count += 1
            time.sleep(1)
            if c != 1:
                browser.find_element(By.XPATH, f'//*[@id="cityPosition"]/div[5]/button').click()
            try:
                browser.find_element(By.XPATH, f'//*[@id="cityPosition"]/div[5]/ul/li[{c}]/a').click()
            except Exception as msg1:
                break
            location_province_name = browser.find_element(By.XPATH,f'//*[@id="cityPosition"]/div[3]/button/span[1]').text  # province name
            location_city_name = browser.find_element(By.XPATH, f'//*[@id="cityPosition"]/div[5]/button').text  # city name
            date = datetime.datetime.now().strftime('%H:%M:%S')  # scrape timestamp (HH:MM:SS despite the name)
            print(province_id,location_province_name, location_city_name, date, count,"----------")
            # One tab per forecast day (7 days).
            for j in range(0, 7):
                browser.find_element(By.XPATH, f'//*[@id="dayList"]/div[{j + 1}]').click()
                time.sleep(1)
                weather_time = browser.find_element(By.XPATH, f'//*[@id="dayList"]/div[{j + 1}]/div[1]').text  # "date weekday" label
                weather_time_day = weather_time[0:3]  # date part
                weather_time_week = weather_time[4:]  # weekday part
                weather_highest_a = browser.find_element(By.XPATH,f'//*[@id="dayList"]/div[{j + 1}]/div[3]').text  # daytime summary field
                weather_highest_b = browser.find_element(By.XPATH,f'//*[@id="dayList"]/div[{j + 1}]/div[4]').text  # daytime summary field
                weather_highest_c = browser.find_element(By.XPATH,f'//*[@id="dayList"]/div[{j + 1}]/div[5]').text  # daytime summary field
                weather_highest_air = browser.find_element(By.XPATH,f'//*[@id="dayList"]/div[{j + 1}]/div[6]/div/div[1]').text  # daily high temperature
                weather_lowest_a = browser.find_element(By.XPATH,f'//*[@id="dayList"]/div[{j + 1}]/div[8]').text  # night summary field
                weather_lowest_b = browser.find_element(By.XPATH,f'//*[@id="dayList"]/div[{j + 1}]/div[9]').text  # night summary field
                weather_lowest_c = browser.find_element(By.XPATH,f'//*[@id="dayList"]/div[{j + 1}]/div[10]').text  # night summary field
                weather_lowest_air = browser.find_element(By.XPATH,f'//*[@id="dayList"]/div[{j + 1}]/div[6]/div/div[2]').text  # daily low temperature
                try:
                    detail_time = browser.find_element(By.XPATH,f'//*[@id="hourTable_{j}"]/tbody/tr[1]/td[2]').text  # hour slot
                    detail_air = browser.find_element(By.XPATH,f'//*[@id="hourTable_{j}"]/tbody/tr[3]/td[2]').text  # temperature
                    detail_rain = browser.find_element(By.XPATH,f'//*[@id="hourTable_{j}"]/tbody/tr[4]/td[2]').text  # precipitation
                    detail_wind_speed = browser.find_element(By.XPATH,f'//*[@id="hourTable_{j}"]/tbody/tr[5]/td[2]').text  # wind speed
                    detail_wind_direction = browser.find_element(By.XPATH,f'//*[@id="hourTable_{j}"]/tbody/tr[6]/td[2]').text  # wind direction
                    detail_air_pressure = browser.find_element(By.XPATH,f'//*[@id="hourTable_{j}"]/tbody/tr[7]/td[2]').text  # pressure
                    detail_humidity = browser.find_element(By.XPATH,f'//*[@id="hourTable_{j}"]/tbody/tr[8]/td[2]').text  # humidity (original comment mislabeled this "temperature")
                    detail_cloud_cover = browser.find_element(By.XPATH,f'//*[@id="hourTable_{j}"]/tbody/tr[9]/td[2]').text  # cloud cover
                except Exception as e:
                    # Hour table missing for this day: pad with blanks.
                    detail_time = " "
                    detail_air = " "
                    detail_rain = " "
                    detail_wind_speed = " "
                    detail_wind_direction = " "
                    detail_air_pressure = " "
                    detail_humidity = " "
                    detail_cloud_cover = " "
                csvw.writerow(
                    [province_id,location_province_name, location_city_name, weather_highest_a, weather_highest_b,
                     weather_highest_c, weather_highest_air,
                     weather_lowest_a, weather_lowest_b, weather_lowest_c, weather_lowest_air,
                     weather_time_day, weather_time_week, detail_time, detail_air, detail_rain, detail_wind_speed,
                     detail_wind_direction, detail_air_pressure, detail_humidity, detail_cloud_cover
                     ])
# Cron-scheduled daily at 16:13:00 via the decorator below.
# NOTE(review): the original comment claimed 08:00:00, which does not match
# the hour=16 / minute='13' arguments -- confirm the intended schedule.
@scheduler.scheduled_job('cron', day_of_week='*', hour=16, minute='13', second='00')
def tick():
    # Fan the scrape out over 4 worker *threads* (the original comment said
    # processes), each covering a range of province indices.
    thead_list = []
    t1 = Thread(target=getData, args=(1, 10))
    t1.start()
    t2 = Thread(target=getData, args=(10, 17))
    t2.start()
    t3 = Thread(target=getData, args=(17, 24))
    t3.start()
    t4 = Thread(target=getData, args=(24, 35))
    t4.start()
    thead_list.append(t1)
    thead_list.append(t2)
    thead_list.append(t3)
    thead_list.append(t4)
    # Wait for every worker before the scheduler marks the job done.
    for t in thead_list:
        t.join()
try:
    # BlockingScheduler.start() blocks until the scheduler is shut down, so
    # the success message below only prints AFTER the scheduler has stopped,
    # not when it starts.
    scheduler.start()
    print('定时任务成功执行')
except Exception as e:
    scheduler.shutdown()
    print('定时任务执行失败')
finally:
    # Always terminate the process once the scheduler returns.
    exit()

对于历史数据的爬取,由于上面的网站并没有提供历史数据,因此爬取的是另外的网站的数据,使用的也是selenium来开启多线程爬取,由于爬取速度太慢,并且年限太多,可以根据自己的需要去更改。完整代码如下:

# coding=gbk
#多线程爬取全国各个市的历史天气数据
import csv
import re
# from sched import scheduler
from threading import Thread
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import time
# Shared output file and csv writer used by every scraper thread.
# NOTE(review): threads write concurrently without a lock -- confirm
# interleaved rows are acceptable.
f = open("data/" + f"history_weather.csv", mode="w", newline='', encoding="utf-8")
csvw = csv.writer(f)
def getData(satrt,end):
    """Scrape daily history weather for provinces [satrt, end) from
    tianqi.2345.com by driving its cascading province/city/county pickers.

    NOTE(review): "satrt" is a typo for "start"; kept because callers pass the
    argument positionally. All threads share the module-level csv writer
    without a lock -- confirm interleaved rows are acceptable.
    """
    chrome_driver = r'G:\pythonProject\venv\Scripts\chromedriver.exe'
    # Hide the "Chrome is being controlled by automated test software" banner.
    options = webdriver.ChromeOptions()
    options.add_experimental_option('useAutomationExtension', False)
    options.add_experimental_option("excludeSwitches", ['enable-automation'])
    browser = webdriver.Chrome(executable_path=chrome_driver, options=options)
    browser.get('https://tianqi.2345.com/wea_history/58362.htm')
    time.sleep(5)
    # Municipalities / SARs whose dropdown exposes a single city entry.
    oneCity = ["北京", "天津", "上海", "重庆", "香港", "澳门", "台湾"]
    # satrt: first province index; end: one past the last province index.
    for i in range(satrt, end):
        try:
            if i != 1:
                browser.find_element(By.XPATH,f'//*[@id="js_cascadeDisable"]').click()
                try:
                    browser.find_element(By.XPATH,'//*[@id="js_cascadeBox"]/div[1]/div[1]/span').click()
                except Exception:
                    # Retry once after giving the overlay time to settle.
                    time.sleep(2)
                    browser.find_element(By.XPATH, '//*[@id="js_cascadeDisable"]').click()
                    browser.find_element(By.XPATH, '//*[@id="js_cascadeBox"]/div[1]/div[1]/span').click()
                browser.find_element(By.XPATH,f'//*[@id="js_provinceCascade"]/li[{i}]/a').click()
                province = browser.find_element(By.XPATH, f'//*[@id="js_cascadeBox"]/div[1]/div[1]/span').text
                province = re.sub(u'([^\u4e00-\u9fa5])', '', province)  # keep CJK characters only
                browser.find_element(By.XPATH,f'//*[@id="js_addDefaultCity"]').click()
                print('省:',province)
            else:
                browser.find_element(By.XPATH,'//*[@id="js_cascadeDisable"]').click()
                time.sleep(1)
                browser.find_element(By.XPATH,'//*[@id="js_cascadeBox"]/div[1]/div[1]/span').click()
                browser.find_element(By.XPATH,f'//*[@id="js_provinceCascade"]/li[{i}]/a').click()
                province = browser.find_element(By.XPATH, f'//*[@id="js_cascadeBox"]/div[1]/div[1]/span').text
                province = re.sub(u'([^\u4e00-\u9fa5])', '', province)
                browser.find_element(By.XPATH, f'//*[@id="js_addDefaultCity"]').click()
                print('省:',province)
        except Exception as e:
            # print("erro", e)
            break
        # Iterate cities (range currently limited to the first city).
        for j in range(1, 2):
            time.sleep(1)
            try:
                if j != 1:
                    browser.find_element(By.XPATH,f'//*[@id="js_cascadeDisable"]').click()
                    try:
                        browser.find_element(By.XPATH,f'//*[@id="js_cascadeBox"]/div[1]/div[2]/span').click()
                    except Exception:
                        time.sleep(2)
                        browser.find_element(By.XPATH, '//*[@id="js_cascadeDisable"]').click()
                        browser.find_element(By.XPATH,f'//*[@id="js_cascadeBox"]/div[1]/div[2]/span').click()
                    browser.find_element(By.XPATH, f'//*[@id="js_cityCascade"]/li[{j}]/a').click()
                    city = browser.find_element(By.XPATH, f'//*[@id="js_cascadeBox"]/div[1]/div[2]/span').text
                    city = re.sub(u'([^\u4e00-\u9fa5])', '', city)
                    browser.find_element(By.XPATH,f'//*[@id="js_addDefaultCity"]').click()
                    print('市:',city)
                else:
                    # browser.find_element(By.XPATH, f'//*[@id="js_cascadeDisable"]').click()
                    # print(000)
                    browser.find_element(By.XPATH, f'//*[@id="js_cascadeDisable"]').click()
                    browser.find_element(By.XPATH,'//*[@id="js_cascadeBox"]/div[1]/div[2]/span').click()
                    if province in oneCity:
                        # Single-city regions expose an un-indexed <li>.
                        browser.find_element(By.XPATH, f'//*[@id="js_cityCascade"]/li/a').click()
                    else:
                        browser.find_element(By.XPATH, f'//*[@id="js_cityCascade"]/li[{j}]/a').click()
                    city = browser.find_element(By.XPATH, f'//*[@id="js_cascadeBox"]/div[1]/div[2]/span').text
                    city = re.sub(u'([^\u4e00-\u9fa5])', '', city)
                    browser.find_element(By.XPATH, f'//*[@id="js_addDefaultCity"]').click()
                    print('市:',city)
            except Exception as e:
                # print(e)
                break
            # Iterate counties/districts (adjust the range as needed).
            for k in range(1, 2):
                time.sleep(1)
                try:
                    if k != 1:
                        browser.find_element(By.XPATH,f'//*[@id="js_cascadeDisable"]').click()
                        try:
                            browser.find_element(By.XPATH,f'//*[@id="js_cascadeBox"]/div[1]/div[3]/span').click()
                        except Exception:
                            browser.find_element(By.XPATH, f'//*[@id="js_cascadeDisable"]').click()
                            browser.find_element(By.XPATH, f'//*[@id="js_cascadeBox"]/div[1]/div[3]/span').click()
                        browser.find_element(By.XPATH,f'//*[@id="js_countyCascade"]/li[{k}]/a').click()
                        area = browser.find_element(By.XPATH, f'//*[@id="js_cascadeBox"]/div[1]/div[3]/span').text
                        area = re.sub(u'([^\u4e00-\u9fa5])', '', area)
                        browser.find_element(By.XPATH,f'//*[@id="js_addDefaultCity"]').click()
                        print('城市:',area)
                    else:
                        browser.find_element(By.XPATH, f'//*[@id="js_cascadeDisable"]').click()
                        browser.find_element(By.XPATH,f'//*[@id="js_cascadeBox"]/div[1]/div[3]/span').click()
                        browser.find_element(By.XPATH,f'//*[@id="js_countyCascade"]/li[{k}]/a').click()
                        area = browser.find_element(By.XPATH, f'//*[@id="js_cascadeBox"]/div[1]/div[3]/span').text
                        area = re.sub(u'([^\u4e00-\u9fa5])', '', area)
                        browser.find_element(By.XPATH,f'//*[@id="js_addDefaultCity"]').click()
                        print('城市:',area)
                except Exception as e:
                    # print(e)
                    break
                # Iterate year dropdown entries 3..12 (labelled m+2010 in the
                # progress print below -- TODO confirm the mapping).
                for m in range(3, 13):
                    try:
                        browser.find_element(By.XPATH, f'//*[@id="js_yearVal"]').click()
                        browser.find_element(By.XPATH,
                            f'/html/body/div[7]/div[2]/div[1]/div[1]/div[1]/div[3]/div[1]/div/ul/li[{m}]/a').click()
                    except Exception as e:
                        # print(e)
                        break
                    # Iterate months 1..12.
                    for y in range(1, 13):
                        time.sleep(1.5)
                        try:
                            browser.find_element(By.XPATH, f'//*[@id="js_monthVal"]').click()
                            browser.find_element(By.XPATH, f'/html/body/div[7]/div[2]/div[1]/div[1]/div[1]/div[3]/div[2]/div/ul/li[{y}]/a').click()
                        except Exception as e:
                            # print(e)
                            break
                        # Walk every day row in the month's history table.
                        for t in range(2, 33):
                            time.sleep(0.5)
                            try:
                                weather_time = browser.find_element(By.XPATH,
                                                                    f'//*[@id="weather-history"]/table/tbody/tr[{t}]/td[1]').text  # "date weekday"
                                date = weather_time.split(" ")[0]
                                week = weather_time.split(" ")[1]
                                weather_highest_air = browser.find_element(By.XPATH,
                                                                           f'//*[@id="weather-history"]/table/tbody/tr[{t}]/td[2]').text  # daily high
                                weather_highest_air = weather_highest_air.split("°")[0]
                                weather_lowest_air = browser.find_element(By.XPATH,
                                                                          f'//*[@id="weather-history"]/table/tbody/tr[{t}]/td[3]').text  # daily low
                                weather_lowest_air = weather_lowest_air.split("°")[0]
                                weather = browser.find_element(By.XPATH,
                                                               f'//*[@id="weather-history"]/table/tbody/tr[{t}]/td[4]').text  # conditions
                                state_of_the_wind = browser.find_element(By.XPATH,
                                                                         f'//*[@id="weather-history"]/table/tbody/tr[{t}]/td[5]').text  # wind direction + force
                                # Normalize "微风" to a force range, then split the
                                # string at the first letter->digit boundary.
                                state_of_the_wind = state_of_the_wind.replace("微风", "0-2级").replace("级级","级")
                                wind = re.split(r'(?<=\D)(?=\d)', state_of_the_wind, maxsplit=1)
                                wind_direction = wind[0]
                                wind_power = wind[1]
                                csvw.writerow(
                                    [province, city, date, week, weather_highest_air, weather_lowest_air, weather,wind_direction,wind_power])
                            except Exception as e:
                                break
                        print(province,city,f'{m+2010}年-{y}月-{t-1}日')

# 每天凌晨8点00分00秒执行一次scheduled_job()装饰器实现
# @scheduler.scheduled_job('cron', day_of_week='*', hour=8, minute='00', second='00')
def tick():
    """Fan the history scrape out over four worker threads, one province
    range each, and wait for all of them to finish."""
    workers = []
    for bounds in ((1, 10), (10, 17), (17, 24), (24, 35)):
        worker = Thread(target=getData, args=bounds)
        worker.start()
        workers.append(worker)
    # Block until every worker has finished.
    for worker in workers:
        worker.join()
def main():
    """Entry point: run the multi-threaded history scrape once."""
    tick()

if __name__ == '__main__':
    main()

  • 3
    点赞
  • 11
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
对于天气预报数据爬取爬取到的数据需要进行清洗和处理,以确保数据的准确性和可用性。以下是一个示例: 1. 数据清洗:去除重复数据、缺失值、异常值等。 ```python import pandas as pd # 读取爬取到的数据文件 df = pd.read_csv('weather_data.csv') # 去除重复数据 df.drop_duplicates(inplace=True) # 去除缺失值 df.dropna(inplace=True) # 去除异常值 df = df[(df['temperature'] > -50) & (df['temperature'] < 50)] # 保存处理后的数据 df.to_csv('cleaned_weather_data.csv', index=False) ``` 2. 数据处理:将数据转换为需要的格式,如日期格式、数值格式等。 ```python import pandas as pd # 读取爬取到的数据文件 df = pd.read_csv('weather_data.csv') # 转换日期格式 df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S') # 转换数值格式 df['temperature'] = df['temperature'].astype(int) # 保存处理后的数据 df.to_csv('processed_weather_data.csv', index=False) ``` 对于论文数据清洗,也需要对数据进行清洗和处理,以确保数据的准确性和可用性。以下是一个示例: 1. 数据清洗:去除重复数据、缺失值、异常值等。 ```python import pandas as pd # 读取爬取到的数据文件 df = pd.read_csv('paper_data.csv') # 去除重复数据 df.drop_duplicates(inplace=True) # 去除缺失值 df.dropna(subset=['title', 'authors', 'year'], inplace=True) # 去除异常值 df = df[(df['year'] >= 2000) & (df['year'] <= 2021)] # 保存处理后的数据 df.to_csv('cleaned_paper_data.csv', index=False) ``` 2. 数据处理:将数据转换为需要的格式,如日期格式、数值格式等。 ```python import pandas as pd # 读取爬取到的数据文件 df = pd.read_csv('paper_data.csv') # 转换日期格式 df['publication_date'] = pd.to_datetime(df['publication_date'], format='%Y-%m-%d') # 保存处理后的数据 df.to_csv('processed_paper_data.csv', index=False) ``` 需要注意的是,在数据清洗和处理过程中,需要根据实际情况进行相应的处理,以确保数据的准确性和可用性。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值