天气爬取程序

最新推荐文章于 2024-06-30 22:44:29 发布

人马座α星

最新推荐文章于 2024-06-30 22:44:29 发布

阅读量1.6k

点赞数 1

分类专栏： python

本文链接：https://blog.csdn.net/weixin_43128028/article/details/105063714

版权

python 专栏收录该内容

15 篇文章

订阅专栏

以下是爬取 911 天气（tianqi.911cha.com）数据的程序。
爬取内容包括：时间、图片、天气、温度、湿度、风力、风级、降水量、体感温度、云量。不过该网站最近已经没有数据了，因此后来换了另一个天气网站进行爬取，对应文章：https://blog.csdn.net/weixin_43128028/article/details/108391587

import requests
from  bs4 import BeautifulSoup
from collections import defaultdict
from dateutil.relativedelta import relativedelta
from datetime import datetime


class weather_data:
    """Scrape month-by-month weather tables for one city from tianqi.911cha.com.

    The scraper walks every month from the start month to the end month
    (both inclusive), downloads that month's page and turns its table rows
    into nested dictionaries.
    """

    # Names given to the cells of each table row, in cell order:
    # time, image, weather, temperature, humidity, wind force, wind scale,
    # precipitation, feels-like temperature, cloud amount.
    _COLUMNS = ('time', 'image', 'weather', 'temperature', 'humidity',
                'wind_force', 'wind_scale', 'precipitation',
                'sendible_temperature', 'cloud_amount')

    # Seconds to wait for the server before giving up on a request; without
    # a timeout a stalled connection would block the scraper indefinitely.
    _TIMEOUT = 30

    def __init__(self, city, start_year, end_year, start_month=1, end_month=12):
        """
        :param city: city to scrape, spelled in full pinyin (used in the URL)
        :param start_year: first year to scrape
        :param end_year: last year to scrape
        :param start_month: first month to scrape (within ``start_year``)
        :param end_month: last month to scrape (within ``end_year``)
        :raises ValueError: if a year/month pair cannot be parsed as ``%Y-%m``
        """
        self.city = city
        self.start_time = datetime.strptime(f"{start_year}-{start_month}", '%Y-%m')
        self.end_time = datetime.strptime(f"{end_year}-{end_month}", '%Y-%m')

    def _iter_months(self):
        """Yield the first day of every month from start_time to end_time inclusive."""
        year, month = self.start_time.year, self.start_time.month
        last = (self.end_time.year, self.end_time.month)
        while (year, month) <= last:
            yield datetime(year, month, 1)
            year, month = (year + 1, 1) if month == 12 else (year, month + 1)

    def _get_original_html(self, month=None):
        """
        Download the page for one month and return it as text.

        :param month: datetime selecting the year/month to fetch; defaults to
            ``self.start_time``
        :return: the response body decoded as UTF-8
        """
        if month is None:
            month = self.start_time

        url = f"https://tianqi.911cha.com/{self.city}/{month.year}-{month.month}.html"
        print(url)
        # Placeholder value: fill in your own browser's User-Agent string.
        header = {
            "User-Agent": "……"}
        response = requests.get(url, headers=header, timeout=self._TIMEOUT)
        return response.content.decode("utf-8")

    def _parse_data(self, html=None, month=None):
        """
        Parse one month's page into ``{date: {"Day": {...}, "Night": {...}}}``.

        The first table row is skipped; after it, odd-numbered rows are stored
        as the "Day" entry of the date found in the row's link, and the
        even-numbered row that follows is stored as that date's "Night" entry.

        :param html: page text to parse; defaults to ``self.html``
        :param month: datetime whose year prefixes the "Day" keys; defaults to
            ``self.start_time``
        :return: defaultdict mapping each date string to its Day/Night dicts
        """
        if html is None:
            html = self.html
        if month is None:
            month = self.start_time

        soup = BeautifulSoup(html, "html.parser")
        data = defaultdict(dict)
        # NOTE(review): only the "Day" keys carry the year prefix (e.g.
        # "2017-time"); the "Night" keys are bare. This looks unintended, but
        # it is kept as is so the returned structure does not change.
        day_prefix = f"{month.year}-"
        date = None
        for n, tr in enumerate(soup.find_all("tr")):
            if n == 0:
                continue

            cells = [td.get_text() for td in tr.find_all("td")]
            if n % 2 != 0:
                link = tr.find("a")
                # A row without a link has no date to file it under, so it
                # (and the night row after it) is skipped instead of crashing.
                date = link.get_text() if link is not None else None
                if date is not None:
                    data[date]["Day"] = {
                        day_prefix + key: text
                        for key, text in zip(self._COLUMNS, cells)
                    }
            elif date is not None:
                data[date]["Night"] = dict(zip(self._COLUMNS, cells))
        return data

    def main(self):
        """
        Scrape every month in the configured range.

        The range is walked with a local cursor, so ``self.start_time`` is
        left untouched and ``main`` can be called more than once.

        :return: list with one parsed dictionary per month, in date order
        """
        data = []
        for month in self._iter_months():
            # The most recent page is still kept on the instance as before.
            self.html = self._get_original_html(month)
            data.append(self._parse_data(self.html, month))

        return data



if __name__ == "__main__":
    # Example run: scrape Jinan from 2017-01 through 2020-02 and dump the
    # result to a text file.
    T = weather_data(city="jinan",start_year=2017,end_year=2020,start_month=1,end_month=2)
    # Scrape everything before opening the output file, so a failed run does
    # not truncate the result of a previous one.
    months = [str(month) for month in T.main()]
    with open('weather_dict.txt','w',encoding='UTF-8') as f:
        # One write of the joined text; the monthly dictionaries are written
        # back to back with no separator, exactly as before.
        f.write("".join(months))