1、天气数据：采用 Python 的 bs4 和 urllib，从网站 http://www.tianqihoubao.com/lishi/beijing/month/201710.html 爬取 2011年1月 到 2017年11月13日 的天气数据，主要有以下四个字段：data(时间)、weather(天气)、temperature(温度)、wind(风力风向)。
'''
Created on 2017-11-13
@author: chen
'''
import csv
from urllib.parse import urljoin
from urllib.request import urlopen

from bs4 import BeautifulSoup
def get_all_weather_url():
    """Yield the attribute dict of every month link on the history index page.

    Each yielded dict is the ``attrs`` mapping of an ``<a>`` tag inside the
    ``div.months`` navigation block (expected keys: ``href`` and ``title``).
    """
    # Use the response as a context manager so the HTTP connection is closed
    # deterministically instead of being leaked until GC.
    with urlopen("http://www.tianqihoubao.com/lishi/beijing/month/201101.html") as response:
        bs_obj = BeautifulSoup(response.read(), "lxml")
    # NOTE(review): if the site layout changes, find() returns None and the
    # next line raises AttributeError — acceptable fail-fast for a crawler.
    months = bs_obj.find("div", {"class": "months"})
    for month in months.find_all("a"):
        yield month.attrs
# url处理
def get_page_url_weather():
    """Yield ``(absolute_url, title)`` for each monthly history page.

    Builds the full URL from the relative ``href`` found on the index page.
    """
    base = "http://www.tianqihoubao.com/"
    for attrs in get_all_weather_url():
        # urljoin avoids the doubled slash that plain string concatenation
        # produces when the href already starts with "/".
        yield urljoin(base, attrs["href"]), attrs["title"]
def get_weather_data():
    """Yield one ``(date, weather, temperature, wind)`` tuple per table row.

    Fetches every monthly page produced by :func:`get_page_url_weather`,
    parses the first ``<table>`` on each page, and cleans each cell's text.
    """
    def _clean(cell):
        # Collapse the cell text to a single token: join fragments with a
        # space, then strip all whitespace remnants (same chain as before).
        return cell.get_text(" ", strip=True).replace("\r\n", "").replace(" ", "")

    seen_urls = set()
    for url, _title in get_page_url_weather():
        # The index may list the same month twice; fetch each URL only once.
        if url in seen_urls:
            continue
        seen_urls.add(url)
        page = BeautifulSoup(urlopen(url).read(), "lxml")
        for row in page.table.find_all("tr"):
            cells = row.find_all("td")
            # Header or malformed rows have fewer than 4 <td> cells;
            # indexing cells[3] there would raise IndexError and abort
            # the whole crawl.
            if len(cells) < 4:
                continue
            data, weather, temperature, wind = (_clean(c) for c in cells[:4])
            yield data, weather, temperature, wind
def main():
    """Crawl all Beijing weather history rows and append them to a CSV file."""
    out_path = r"C:\Users\chen\Desktop\北京天气数据每日更新.csv"
    with open(out_path, "w+", newline="") as csv_file:
        writer = csv.writer(csv_file)
        # enumerate replaces the hand-rolled counter; output is unchanged.
        for count, record in enumerate(get_weather_data(), start=1):
            writer.writerow(list(record))
            print("第" + str(count) + "次写入成功")
    print("写入完毕")


if __name__ == '__main__':
    main()
2、空气质量数据：包含以下字段的数据
data, Quality, AQI_data, AQI_rank, PM2_5, PM10, SO2, NO2, CO2, O3
'''
Created on 2017-11-14
@author: chen
'''
from bs4 import BeautifulSoup
from urllib.request import urlopen
import csv
def get_all_weather_url():
    """Yield the ``<a>`` tag of every monthly AQI page linked from the index."""
    # Context-manage the response so the HTTP connection is closed promptly.
    with urlopen("http://www.tianqihoubao.com/aqi/beijing-201708.html") as response:
        bs_obj = BeautifulSoup(response.read(), "lxml")
    box = bs_obj.find("div", {"class": "box p"})
    for item in box.find_all("li"):
        link = item.a
        # An <li> without an <a> would previously yield None, and the
        # consumer's link["href"] would crash on it — skip such entries.
        if link is not None:
            yield link
# url处理
def get_page_url_weather():
    """Yield the absolute URL of each monthly AQI page."""
    base = "http://www.tianqihoubao.com/"
    for link in get_all_weather_url():
        # urljoin handles hrefs that already begin with "/" without
        # producing a doubled slash, unlike plain concatenation.
        yield urljoin(base, link["href"])
def get_weather_data():
    """Yield one 10-field tuple per day from the monthly AQI tables.

    Field order: date, quality grade, AQI, AQI rank, PM2.5, PM10, SO2,
    NO2, CO2, O3.
    NOTE(review): the original code names the 9th value CO2; on these pages
    the column is presumably CO (carbon monoxide) — confirm against the site.
    """
    def _clean(cell):
        # Same cleaning chain as the original per-cell expression.
        return cell.get_text(" ", strip=True).replace("\r\n", "").replace(" ", "")

    seen_urls = set()
    for url in get_page_url_weather():
        # Fetch each monthly page at most once.
        if url in seen_urls:
            continue
        seen_urls.add(url)
        page = BeautifulSoup(urlopen(url).read(), "lxml")
        for row in page.table.find_all("tr"):
            cells = row.find_all("td")
            # Header or malformed rows have fewer than 10 <td> cells;
            # indexing cells[9] there would raise IndexError and abort the
            # crawl. (Also drops the leftover debug print of len/url.)
            if len(cells) < 10:
                continue
            yield tuple(_clean(c) for c in cells[:10])
def main():
    """Crawl all Beijing AQI rows and append them to a CSV file."""
    out_path = r"C:\Users\chen\Desktop\北京每日空气质量更新.csv"
    with open(out_path, "w+", newline="") as csv_file:
        writer = csv.writer(csv_file)
        # enumerate replaces the hand-rolled counter; output is unchanged.
        for count, record in enumerate(get_weather_data(), start=1):
            writer.writerow(list(record))
            print("第" + str(count) + "次写入成功")
    print("写入完毕")


if __name__ == '__main__':
    main()