爬取秦皇岛天气情况

一、bs4 方法解析

源代码

import requests
import re
import time
import xlwt
from bs4 import BeautifulSoup
import pandas as pd
from fake_useragent import UserAgent
# Column accumulators for the whole crawl: index i of every list describes
# the same day. They are filled by crawl_bs4().
date_box = []
max_temp = []
min_temp = []
weh = []
wind = []

# Compiled once at import time instead of once per month page.
_DATE_RE = re.compile('class="th200">(.*?)</')
_FIELD_RE = re.compile('class="th140">(.*?)</')
_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}
# requests waits indefinitely unless a timeout is given explicitly.
_REQUEST_TIMEOUT = 10
_OUTPUT_CSV = './秦皇岛天气情况.csv'


def month_url(year, month):
    """Return the history-page URL for *year*/*month* (month zero-padded to 2 digits)."""
    return f'https://lishi.tianqi.com/qinhuangdao/{year}{month:02d}.html'


def group_day_fields(fields):
    """Split the flat list of "th140" cell texts into one 4-tuple per day.

    Each tuple is (max temperature, min temperature, weather, wind), in the
    order the cells appear. A trailing group with fewer than four cells is
    dropped.
    """
    return [tuple(fields[i:i + 4]) for i in range(0, len(fields) - 3, 4)]


def fetch_month(year, month):
    """Download one month page and return ``(dates, day_fields)``.

    ``dates`` holds the first 10 characters of every "th200" cell and
    ``day_fields`` is the output of group_day_fields() for the "th140" cells.

    Raises:
        requests.RequestException: if the HTTP request fails or times out.
    """
    html = requests.get(
        month_url(year, month),
        headers=_REQUEST_HEADERS,
        timeout=_REQUEST_TIMEOUT,
    ).text
    soup = BeautifulSoup(html, 'html.parser')
    # The regexes run over the serialized markup of the "thrui" elements only.
    markup = str(soup.find_all(class_='thrui'))
    dates = [cell[:10] for cell in _DATE_RE.findall(markup)]
    return dates, group_day_fields(_FIELD_RE.findall(markup))


def crawl_bs4(years=range(2011, 2022)):
    """Scrape every month of *years*, fill the module-level lists, write the CSV.

    The DataFrame is built and saved once after the crawl rather than being
    rebuilt and rewritten after every month. A month whose request fails is
    reported and skipped so the data already collected is still saved.

    Returns:
        The pandas DataFrame that was written to the CSV file.
    """
    for year in years:
        for month in range(1, 13):
            print(f'爬取{year}年{month}月的天气数据')
            try:
                dates, days = fetch_month(year, month)
            except requests.RequestException as err:
                print(f'request for {year}-{month:02d} failed, skipping: {err}')
                continue
            if len(dates) != len(days):
                # Unequal column lengths would make pd.DataFrame raise, so
                # only rows that have both a date and all four fields are kept.
                print(f'{year}-{month:02d}: {len(dates)} dates but {len(days)} '
                      f'records; keeping the first {min(len(dates), len(days))}')
            for day, (high, low, sky, breeze) in zip(dates, days):
                date_box.append(day)
                max_temp.append(high)
                min_temp.append(low)
                weh.append(sky)
                wind.append(breeze)
            time.sleep(0.05)

    datas = pd.DataFrame({'日期': date_box, '最高温度': max_temp, '最低温度': min_temp, '天气': weh, '风向': wind})
    print(datas)
    datas.to_csv(_OUTPUT_CSV, index=False, sep=',', encoding='utf-8_sig')
    return datas


if __name__ == '__main__':
    datas = crawl_bs4()

二、xpath 方法解析(我更习惯用这种方式,感觉比较方便)

源代码

from lxml import etree
from fake_useragent import UserAgent
import time
import requests
import csv
import random
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}
# Column order matches the rows built by parse_day_row(): the maximum
# temperature is written before the minimum.
names = ["年", "月", "日", "最高温(℃)", "最低温(℃)", "天气", "风向", "级数"]
month = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]
year = ["19", "20", "21"]

XPATH_CSV_PATH = '秦皇岛.csv'
# NOTE(review): absolute, position-based path; it stops matching if the page
# layout changes -- confirm against the live page before relying on it.
DAY_ROW_XPATH = '/html/body/div[7]/div[1]/div[4]/ul/li'
XPATH_REQUEST_TIMEOUT = 10  # seconds


def parse_day_row(texts):
    """Turn the text nodes of one day entry into a CSV row.

    Args:
        texts: the strings returned by ``./div/text()`` for one ``<li>``:
            date, max temperature, min temperature, weather, wind.

    Returns:
        ``[year, month, day, max, min, weather, wind direction, wind level]``
        as strings, or ``None`` when fewer than five text nodes are present.
        The wind level is ``""`` when the wind text contains no
        space-separated second part.
    """
    if len(texts) < 5:
        return None
    date_text, high, low, weather, wind_text = texts[:5]
    # Only the part before the first space is used as the "Y-M-D" date.
    row = date_text.split(" ")[0].split("-")
    wind_parts = wind_text.split(" ")
    level = wind_parts[1].replace("级", "") if len(wind_parts) > 1 else ""
    row.extend([
        high.replace("℃", ""),
        low.replace("℃", ""),
        weather,
        wind_parts[0],
        level,
    ])
    return row


def crawl_xpath():
    """Scrape every month listed in ``year``/``month`` and append the rows to the CSV.

    The output file is opened once for the whole crawl, and the header row
    is written only when the file does not exist yet or is empty, so re-runs
    do not insert a second header. A random pause of up to two seconds
    follows every request.
    """
    import os  # local import: used only for the header check below

    needs_header = (not os.path.exists(XPATH_CSV_PATH)
                    or os.path.getsize(XPATH_CSV_PATH) == 0)
    with open(XPATH_CSV_PATH, 'a', newline='', encoding='utf-8-sig') as fp:
        writer = csv.writer(fp)
        if needs_header:
            writer.writerow(names)
        for y in year:
            for m in month:
                url = 'https://lishi.tianqi.com/qinhuangdao/20{}{}.html'.format(y, m)
                print(url)
                response = requests.get(url=url, headers=headers,
                                        timeout=XPATH_REQUEST_TIMEOUT)
                tree = etree.HTML(response.text)
                for entry in tree.xpath(DAY_ROW_XPATH):
                    # Evaluate the XPath once per entry instead of once per field.
                    row = parse_day_row(entry.xpath('./div/text()'))
                    if row is not None:
                        writer.writerow(row)
                # Throttle after every request, not once per year.
                time.sleep(round(random.uniform(0, 2), 2))


if __name__ == '__main__':
    crawl_xpath()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

py爱好者~

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值