python爬取aqistudy数据

最新推荐文章于 2024-07-30 17:21:38 发布

DanielMaster

最新推荐文章于 2024-07-30 17:21:38 发布

阅读量2.1k

点赞数 1

分类专栏：开发工具　文章标签：python、selenium

本文链接：https://blog.csdn.net/a805814077/article/details/117359964

版权

开发工具专栏收录该内容

8 篇文章 7 订阅

订阅专栏

由于网上相关的代码大多数都已失效，无法使用，这里更新一版可以使用的代码。

截至 2021-05-28 亲测可用，这里以爬取山西省 11 个城市的 AQI 数据为例。

get_daily_data.py

import time
from selenium import webdriver
import pandas as pd
from concurrent.futures import ProcessPoolExecutor
import urllib.request as request


# pip install selenium==2.48.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
def get_year_months(start_year, start_month, end_year, end_month):
    """Return every month from start to end, inclusive, as ``YYYYMM`` integers.

    The integers match the ``month=`` query parameter format used when the
    page URL is built (e.g. ``202105`` for May 2021).

    Args:
        start_year: Year of the first month; anything ``int()`` accepts.
        start_month: First month, 1-12.
        end_year: Year of the last month; anything ``int()`` accepts.
        end_month: Last month, 1-12 (included in the result).

    Returns:
        A list of ints in chronological order. The list is empty when the
        start month lies after the end month.

    Raises:
        ValueError: If either month is outside 1-12, or an argument cannot
            be converted to ``int``.

    Example:
        >>> get_year_months(2020, 11, 2021, 2)
        [202011, 202012, 202101, 202102]
    """
    start_year, start_month, end_year, end_month = (
        int(value) for value in (start_year, start_month, end_year, end_month)
    )

    # Validate both months up front so that same-year ranges are checked too;
    # otherwise values such as 202100 or 202113 could be produced silently.
    for month in (start_month, end_month):
        if not 1 <= month <= 12:
            raise ValueError('month must be between 1 and 12, got %d' % month)

    # Count months on a single linear axis so one range() covers single-year
    # and multi-year spans alike; an inverted span yields an empty range.
    first_index = start_year * 12 + (start_month - 1)
    last_index = end_year * 12 + (end_month - 1)

    year_months = []
    for index in range(first_index, last_index + 1):
        year, month_offset = divmod(index, 12)
        year_months.append(year * 100 + month_offset + 1)
    return year_months


if __name__ == '__main__':
    # Script entry point: for each configured city, load the monthly history
    # page in a headless browser, parse its first HTML table with pandas and
    # append the rows to data_test/<city>.csv.
    import os

    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    start = time.perf_counter()

    # ------------------------------ basic settings ------------------------------
    base_aqi_url = r'https://www.aqistudy.cn/historydata/daydata.php?'
    # Full list of the 11 Shanxi cities; swap it in to crawl all of them.
    # city_set = ['太原', '大同', '朔州', '忻州', '阳泉', '吕梁', '晋中', '长治', '晋城', '临汾', '运城']
    city_set = ['太原']
    # Target months; the final month of the final year is included.
    # The range is the same for every city, so compute it once.
    year_months = get_year_months(2021, 5, 2021, 5)
    output_dir = 'data_test'
    # to_csv() does not create missing directories, so create it up front.
    os.makedirs(output_dir, exist_ok=True)

    # ---------------------------------- crawl -----------------------------------
    # One browser instance is shared by all cities and is always shut down,
    # even if a page load or table parse fails part-way through.
    driver = webdriver.PhantomJS(r'C:\phantomjs-2.1.1-windows\bin\phantomjs.exe')
    try:
        for city_chinese_name in city_set:
            # URL-encode the Chinese city name for use in the query string.
            city_name = request.quote(city_chinese_name)
            city_aqi_url = base_aqi_url + 'city=%s' % city_name
            csv_path = os.path.join(output_dir, str(city_chinese_name) + '.csv')

            for year_month in year_months:
                # Add the month to the URL.
                city_year_month_url = city_aqi_url + '&month=%d' % year_month
                driver.get(city_year_month_url)
                # NOTE(review): presumably waits for the page's scripts to
                # fill in the table before the source is read - confirm.
                time.sleep(1)
                # Parse the first table of the rendered page with pandas.
                dfs = pd.read_html(driver.page_source, header=0)[0]
                time.sleep(0.5)
                # Tag every row with the city name in an extra column.
                dfs[1] = str(city_chinese_name)
                # Append to the per-city CSV (no header row, no index column).
                dfs.to_csv(csv_path, header=None, index=None, mode='a+',
                           encoding='utf_8_sig')
    finally:
        driver.quit()

    end = time.perf_counter()
    print('Running time: %s Seconds' % round((end - start), 2))

注意事项

selenium 版本过高会出现警告 'Selenium support for PhantomJS has been deprecated, please use headless versions of Chrome or Firefox instead'，降低版本即可：
```
pip install selenium==2.48.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
```
phantomjs下载链接

https://phantomjs.org/download.html

结果截图

在这里插入图片描述