15-搜狗天气信息爬取

最新推荐文章于 2024-10-01 20:02:09 发布

chuiai8582

最新推荐文章于 2024-10-01 20:02:09 发布

阅读量217

点赞数

文章标签：爬虫 python

原文链接：https://my.oschina.net/pansy0425/blog/2998676

版权

目的：按照区域对全国各个区域的天气信息进行爬取

结果呈现：屏幕显示各城市信息保存成功；按区域生成相应的 CSV 文件

注：机房的空调坏了，真的很冷，暖水捂子根本离不开手。南京真的很冷.......

注：我的学校是南京信息工程大学，应该是全国气象最牛B的学校了啊！虽然我也没学啥气象......但是学校的气象背景这么浓厚，我就爬点气象数据吧，假装自己也搞过气象！~

搜狗天气城市索引页：http://tianqi.sogou.com/cityindex

#下面为本实例的爬虫代码，若有问题可以给我留言，或者有更好的解决方法也可以私信我~

import requests
from bs4 import BeautifulSoup
import os
import csv

def get_page(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded page text, or ``None`` when the request fails (the
        error is printed so the caller can see why).
    """
    headers = {'user-agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException as e:
        # Best-effort fetch: report the network/HTTP problem and signal the
        # failure with None instead of aborting the whole crawl here.
        print(e)
        return None
    # Let requests guess the charset from the body rather than trusting the
    # HTTP header.
    r.encoding = r.apparent_encoding
    return r.text

def get_cate(url):
    """Collect the links listed on the index page at *url*.

    Every ``<dl class="list-col">`` contributes the text of its first
    ``<dt>`` as a group label and each ``<a>`` inside its first ``<dd>`` as
    one entry.

    Args:
        url: Address of the index page; it is also used as the prefix that
            each link's ``href`` is appended to.

    Returns:
        A ``(url_list, cate_name)`` pair of equally long lists: the page
        address of every entry and its ``'<group>_<link text>'`` label.
    """
    soup = BeautifulSoup(get_page(url), 'html.parser')
    links = []
    labels = []
    for block in soup.find_all('dl', {'class': {'list-col'}}):
        group = block.find_all('dt')[0].text.strip()
        anchors = block.find_all('dd')[0].find_all('a')
        for anchor in anchors:
            labels.append(group + '_' + anchor.text.strip())
            links.append(url + anchor['href'])
    return links, labels

def get_info(url):
    """Scrape today's conditions and the multi-day forecast from one page.

    Args:
        url: Address of the weather page to scrape.

    Returns:
        A ``(weather, name)`` pair of flat, equally long lists. ``weather``
        holds the scraped values and ``name`` the column title of each
        value: seven "today" fields followed by six fields per forecast day.

    Raises:
        RuntimeError: If the page could not be downloaded.
    """
    html = get_page(url)
    if html is None:
        # get_page reports failures by returning None; fail with a clear
        # message instead of an opaque TypeError inside BeautifulSoup.
        raise RuntimeError('failed to download {}'.format(url))
    soup = BeautifulSoup(html, 'html.parser')

    today_values, today_names = _parse_today(soup)
    forecast_values, forecast_names = _parse_forecast(soup)
    return today_values + forecast_values, today_names + forecast_names


def _parse_today(soup):
    """Return the values and column titles of the seven 'today' fields."""
    panel = soup.find('div', {'class': {'c-left'}})
    top = panel('div')[0]
    temperature = top.find('span', {'class': {'num'}}).text.strip()  # overall temperature today
    condition = top.find('p', {'class': {'text'}}).text.strip()      # weather condition today

    row2 = panel.find('div', {'class': {'row2'}})
    date = row2.find('a', {'class': {'date'}}).text.strip()          # today's date
    wind_tag = row2.find('span', {'class': {'wind'}})
    wind = wind_tag.text.strip()                                     # wind today

    # NOTE(review): the original looked up span.wind a second time here, so
    # the humidity column merely repeated the wind text. The class name
    # 'hundity' is assumed from the original variable name -- TODO confirm
    # against the live markup. If no such element exists we fall back to the
    # previous lookup, so the result is never worse than before.
    humidity_tag = row2.find('span', {'class': {'hundity'}})
    if humidity_tag is None:
        humidity_tag = wind_tag
    humidity = humidity_tag.text.strip()                             # humidity today

    # air quality index; collapse line breaks so it stays on one CSV line
    air = row2.find('span', {'class': {'liv-text'}}).text.strip().replace('\n', ' ')

    # weather warning; an empty block means no warning is in force
    warning = panel.find('div', {'class': {'row3'}}).text.strip()
    if warning == '':
        warning = '无'

    values = [temperature, condition, date, wind, humidity, air, warning]
    names = ['今日总体气温', '今日天气情况', '今日日期', '今日风况', '今日湿度', '空气指数', '今日预警']
    return values, names


def _parse_forecast(soup):
    """Return the values and column titles of the per-day forecast."""
    panel = soup.find_all('div', {'class': {'c-right'}})[1]
    days = []
    for li in panel('ul')[0]('li'):
        days.append((
            li.find('p', {'class': {'date'}}).text.strip(),  # date
            li.find('p', {'class': {'text'}}).text.strip(),  # day of week
            li.find('p', {'class': {'des'}}).text.strip(),   # weather description
            li.find('p', {'class': {'wind'}}).text.strip(),  # wind level
        ))

    # High/low temperatures are stored as comma-separated attribute values.
    temps = panel.find('div', {'class': {'r-temp'}})
    highs = temps['data-high'].split(',')
    lows = temps['data-low'].split(',')

    values = []
    names = []
    # Pair each day with its temperatures directly rather than inserting at
    # computed offsets; zip stops at the shortest list, so values and column
    # titles always stay aligned even if the page lists differ in length.
    for (date, weekday, description, wind), high, low in zip(days, highs, lows):
        values.extend([date, weekday, high, low, description, wind])
        names.extend(['日期', '星期', '最高气温', '最低气温', '天气情况', '风级'])
    return values, names

def save_to_csv(weather,name,cate):
    """Append one row of weather data to a CSV file chosen by *cate*.

    Args:
        weather: Values for one row, in column order.
        name: Column titles matching ``weather``; written as the header row
            only while the target file is still empty.
        cate: Label of the form ``'<group>_<city>'``. The part before the
            first underscore names the file (``<group>.csv`` in the current
            directory); the second part goes into the leading city column.

    The ``weather`` and ``name`` lists passed in are not modified.
    """
    parts = cate.split('_')
    file = parts[0] + '.csv'
    # Build fresh lists instead of inserting into the caller's lists, so the
    # arguments can be reused safely after the call.
    row = [parts[1]] + list(weather)
    header = ['城市'] + list(name)
    with open(file, 'a+', encoding='utf-8', newline="") as csv_file:
        csv_write = csv.writer(csv_file)
        # A zero-byte file has just been created: emit the header first.
        if os.path.getsize(file) == 0:
            csv_write.writerow(header)
        csv_write.writerow(row)
    print('{}-->信息保存成功！'.format(cate))

if __name__ == '__main__':
    # Entry point: read the index page, then scrape and store every linked
    # page in the order the links were found.
    start_url = 'http://tianqi.sogou.com/cityindex'
    page_urls, page_labels = get_cate(start_url)
    for page_url, label in zip(page_urls, page_labels):
        row, columns = get_info(page_url)
        save_to_csv(row, columns, label)

屏幕显示：【部分】