写在前面:代码注释写得比较清楚,亲测可直接使用。中间步骤本来需要配截图,但有点麻烦,暂时没有找到好用的截图工具(好吧,是我懒了),有空的时候再补上。如果有问题,欢迎在评论区提出。
一、目的:爬取哈尔滨天气信息
二、步骤及实现
# -*- coding: utf-8 -*-
#@Time :2021/10/25 16:03
#@Author :帅哥
#@File :爬虫练习_哈尔滨天气信息爬取
#@Project :
#导入需要的库函数
import requests
from bs4 import BeautifulSoup
import pandas as pd
def get_weather(url, request_headers=None, timeout=10):
    """Download one weather-history page and return its table as a DataFrame.

    Args:
        url: Address of the page to fetch.
        request_headers: Optional HTTP headers for the request. When omitted,
            a module-level ``headers`` dict is used if one exists (the script
            entry point defines it); otherwise the ``requests`` defaults apply.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``pandas.DataFrame`` with the columns "日期", "天气状况" and "气温",
        one row per non-empty ``<tr>`` after the first one.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    if request_headers is None:
        # Keeps get_weather(url) working for the script entry point, which
        # defines ``headers`` at module level, without raising NameError
        # when the module is imported and no such global exists.
        request_headers = globals().get('headers')

    # 1. Send the request and fetch the raw page.
    response = requests.get(url=url, headers=request_headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page into an
    # empty or garbage table.
    response.raise_for_status()
    # The encoding is pinned rather than taken from apparent_encoding, which
    # the original author notes can guess wrong. To check a page's encoding:
    # browser dev tools -> Network -> reload -> Headers -> Content-Type.
    response.encoding = 'gb2312'

    # 2. Parse the page (.text is the decoded string; .content would be bytes).
    soup = BeautifulSoup(response.text, 'lxml')
    rows = soup.find_all("tr")

    dates, conditions, temperatures = [], [], []
    # Skip the first row: it holds the column captions, not data.
    for row in rows[1:]:
        cells = row.text.split()
        if not cells:
            # A spacer/empty <tr> has no tokens; indexing it would raise.
            continue
        dates.append(cells[0])
        # Tokens 1-2 are glued together as the weather condition and tokens
        # 3-5 as the temperature -- presumably "day /night" and
        # "high / low"; verify against the live page layout.
        conditions.append("".join(cells[1:3]))
        temperatures.append("".join(cells[3:6]))

    weather_date = pd.DataFrame(
        {
            "日期": dates,
            "天气状况": conditions,
            "气温": temperatures,
        }
    )
    # The frame is returned without being printed here; the caller decides
    # whether to display it (the script entry point already does).
    return weather_date
# Collect the addresses of the pages to scrape.
def get_url(url, headers=None, timeout=10):
    """Return the ``href`` of every link in the page's month index.

    Args:
        url: Address of a page containing a ``#content > div.months`` block.
        headers: Optional HTTP headers for the request; ``None`` sends the
            ``requests`` defaults, which is what this function always did.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``href`` attribute values in document order, exactly as
        written in the page (they are not joined with the site root here).

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Surface HTTP errors instead of silently returning an empty list.
    response.raise_for_status()
    # Encoding is pinned explicitly, matching get_weather.
    response.encoding = 'gb2312'

    soup = BeautifulSoup(response.text, 'lxml')
    anchors = soup.select("#content > div.months > a")
    # Anchors without an href (e.g. named anchors) are skipped rather than
    # raising KeyError.
    return [anchor['href'] for anchor in anchors if anchor.has_attr('href')]
if __name__ == '__main__':
    # Imported here because only the script entry point needs it.
    from urllib.parse import urljoin

    # Entry page: get_url() reads the month links from it.
    url = 'http://www.tianqihoubao.com/lishi/haerbin/month/202109.html'
    # Browser-like request headers. get_weather() reads this module-level
    # name, so it must stay called ``headers``. Accept-Encoding and Cookie
    # are intentionally not sent.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.tianqihoubao.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36',
    }

    for month_link in get_url(url):
        # urljoin resolves site-relative hrefs against the entry page and
        # leaves already-absolute hrefs untouched, unlike plain string
        # concatenation with the site root.
        month_url = urljoin(url, month_link)
        try:
            data = get_weather(month_url)
        except requests.RequestException as exc:
            # One unreachable month should not abort the whole crawl.
            print('Skipping', month_url, '-', exc)
            continue
        print(data)
三、效果图