Python 爬取 aqistudy 数据

由于网上相关的代码大多数都已失效,无法使用,这里更新一版可以使用的代码。

截至 2021-05-28 亲测可用,这里以爬取山西省 11 个城市的 AQI 数据为例。

get_daily_data.py

import time
from selenium import webdriver
import pandas as pd
from concurrent.futures import ProcessPoolExecutor
import urllib.request as request


# pip install selenium==2.48.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
#  按网页url的格式生成一段时间内的日期
# get_year_months(2020, 4, 2021, 4)
# [202004, 202005, 202006, 202007, 202008, 202009, 202010, 202011, 202012, 202101, 202102, 202103, 202104]
def get_year_months(start_year, start_month, end_year, end_month):
    """Return every month from the start month to the end month, inclusive.

    Each month is encoded as the integer ``year * 100 + month`` (for example
    ``202004``), which is the format the caller interpolates into the
    ``month=`` query parameter of the page URL.

    Args:
        start_year: Year of the first month (anything ``int()`` accepts).
        start_month: First month, 1-12.
        end_year: Year of the last month.
        end_month: Last month, 1-12 (included in the result).

    Returns:
        A list of ints in chronological order. The list is empty when the
        start month lies after the end month.

    Raises:
        ValueError: If either month is outside 1-12, or an argument cannot
            be converted to ``int``.

    Example:
        >>> get_year_months(2020, 11, 2021, 2)
        [202011, 202012, 202101, 202102]
    """
    start_year, start_month, end_year, end_month = (
        int(value) for value in (start_year, start_month, end_year, end_month)
    )

    # Validate both months up front so that every code path rejects bad input;
    # previously a same-year range such as (2021, 0, 2021, 13) slipped through
    # and produced invalid codes like 202100 and 202113.
    for label, month in (('start_month', start_month), ('end_month', end_month)):
        if not 1 <= month <= 12:
            raise ValueError('%s must be in 1..12, got %d' % (label, month))

    # Work on a linear month index (year * 12 + zero-based month) so that year
    # boundaries need no special casing. An inverted range is simply empty.
    first_index = start_year * 12 + (start_month - 1)
    last_index = end_year * 12 + (end_month - 1)
    return [
        (index // 12) * 100 + (index % 12) + 1
        for index in range(first_index, last_index + 1)
    ]


if __name__ == '__main__':
    # Script entry point: for every city in city_set, load the monthly
    # history page in PhantomJS, read the first HTML table with pandas and
    # append its rows to data_test/<city>.csv.
    import os  # local import: only the script entry point needs it

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    start = time.perf_counter()

    # ------------------------------ settings ------------------------------
    base_aqi_url = r'https://www.aqistudy.cn/historydata/daydata.php?'
    # Full list of the 11 Shanxi cities:
    # city_set = ['太原', '大同', '朔州', '忻州', '阳泉', '吕梁', '晋中', '长治', '晋城', '临汾', '运城']
    city_set = ['太原']
    # Months to crawl; the end month is included. The list does not depend on
    # the city, so it is computed once instead of once per city.
    year_months = get_year_months(2021, 5, 2021, 5)
    # to_csv() does not create missing directories, so make sure it exists.
    output_dir = 'data_test'
    os.makedirs(output_dir, exist_ok=True)

    # ------------------------------- crawl --------------------------------
    # NOTE: the original wrapped the loop below in a ProcessPoolExecutor but
    # never submitted any work to it, so the crawl has always been
    # sequential. The unused pool (and its unused settings) were dropped.
    # One browser is shared by all cities and is always shut down, even if a
    # page fails to load or parse.
    driver = webdriver.PhantomJS(r'C:\phantomjs-2.1.1-windows\bin\phantomjs.exe')
    try:
        for city_chinese_name in city_set:
            # URL-encode the Chinese city name for use in the query string.
            city_name = request.quote(city_chinese_name)
            city_aqi_url = base_aqi_url + 'city=%s' % city_name
            output_path = os.path.join(output_dir, str(city_chinese_name) + '.csv')

            for year_month in year_months:
                # Add the month to the URL and load the page.
                city_year_month_url = city_aqi_url + '&month=%d' % year_month
                driver.get(city_year_month_url)
                # Fixed pause kept from the original; presumably gives the
                # page's scripts time to fill in the table - TODO confirm.
                time.sleep(1)
                # Read the first table in the rendered page, using its first
                # row as the column header.
                dfs = pd.read_html(driver.page_source, header=0)[0]
                time.sleep(0.5)
                # Add a column (labelled 1) holding the city name.
                dfs[1] = str(city_chinese_name)
                # Append rows without header or index so that repeated months
                # accumulate in a single file per city.
                dfs.to_csv(output_path, header=False, index=False, mode='a+',
                           encoding='utf_8_sig')
    finally:
        driver.quit()

    end = time.perf_counter()
    print('Running time: %s Seconds' % round((end - start), 2))

注意事项

  • selenium 版本过高会出现警告 'Selenium support for PhantomJS has been deprecated, please use headless versions of Chrome or Firefox instead',降低版本即可

    pip install selenium==2.48.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
    
  • phantomjs下载链接

    https://phantomjs.org/download.html

结果截图

在这里插入图片描述

评论 7
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

DanielMaster

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值