Preface
Scraping weather data is a common requirement in day-to-day development, whether for a personal project or a commercial application. This article explains in detail how to scrape weather data with Python and save it in a structured format, using the China Weather Network as the example to walk through a complete scraper implementation.
1. Preparation
1.1 Required Tools
Python 3.x
requests (for sending HTTP requests)
BeautifulSoup4 (for parsing HTML)
pandas (for data processing)
lxml (parser backend)
1.2 Installing Dependencies
pip install requests beautifulsoup4 pandas lxml
二、分析目标网站
我们以中国天气网(http://www.weather.com.cn)为例。首先需要分析网页结构:
打开中国天气网,搜索目标城市(如北京)
检查网页源代码,找到天气数据所在位置
分析URL规律,便于构造请求
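City pages on weather.com.cn follow the pattern /weather/<city_code>.shtml, where the code is a numeric city ID; Beijing's, for example, is 101010100. Once the code is known, a forecast URL can be built directly:

# City pages follow /weather/<city_code>.shtml
# 101010100 is Beijing's city code
city_code = "101010100"
url = f"http://www.weather.com.cn/weather/{city_code}.shtml"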
3. Basic Scraper Implementation
3.1 Fetching the Page
import requests
from bs4 import BeautifulSoup

def get_html(url):
    """Fetch a page and return its decoded HTML text."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        # Let requests infer the real encoding from the page content
        response.encoding = response.apparent_encoding
        return response.text
    except Exception as e:
        print(f"Failed to fetch page: {e}")
        return None
3.2 Parsing the Weather Data
def parse_weather(html):
    """Parse the 7-day forecast out of a city page."""
    soup = BeautifulSoup(html, 'lxml')
    weather_data = []
    # The city name is the last link in the breadcrumb bar
    city = soup.find('div', class_='crumbs fl').find_all('a')[-1].text
    # The 7-day forecast lives in <div id="7d"> as a <ul class="t clearfix">
    days = soup.find('div', id='7d').find('ul', class_='t clearfix').find_all('li')
    for day in days:
        date = day.find('h1').text
        weather = day.find('p', class_='wea').text
        temp = day.find('p', class_='tem')
        # The high-temperature <span> disappears at night, so guard against None
        high_temp = temp.find('span').text if temp.find('span') else ''
        low_temp = temp.find('i').text if temp.find('i') else ''
        wind = day.find('p', class_='win').find('i').text if day.find('p', class_='win') else ''
        weather_data.append({
            'city': city,
            'date': date,
            'weather': weather,
            'high_temp': high_temp,
            'low_temp': low_temp,
            'wind': wind
        })
    return weather_data
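With both helpers in place, fetching and parsing a forecast takes two calls; for example, using Beijing's city code from section 2:

# Fetch and parse Beijing's 7-day forecast
html = get_html("http://www.weather.com.cn/weather/101010100.shtml")
if html:
    for record in parse_weather(html):
        print(record)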
4. Complete Implementation
import requests
from bs4 import BeautifulSoup
import pandas as pd

class WeatherSpider:
    def __init__(self):
        self.base_url = "http://www.weather.com.cn"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def get_city_code(self, city_name):
        """Look up a city's code from the weather index page."""
        url = f"{self.base_url}/weather/"
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            soup = BeautifulSoup(response.text, 'lxml')
            # Newer BeautifulSoup versions prefer string= over the deprecated text=
            city_link = soup.find('a', string=city_name)
            if city_link:
                return city_link['href'].split('/')[-1].replace('.shtml', '')
            return None
        except Exception as e:
            print(f"Failed to look up city code: {e}")
            return None

    def get_weather_data(self, city_code):
        """Fetch and parse the forecast for one city code."""
        url = f"{self.base_url}/weather/{city_code}.shtml"
        html = self.get_html(url)
        if html:
            return self.parse_weather(html)
        return []

    def get_html(self, url):
        """Fetch a page and return its decoded HTML text."""
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            response.encoding = response.apparent_encoding
            return response.text
        except Exception as e:
            print(f"Failed to fetch page: {e}")
            return None

    def parse_weather(self, html):
        """Parse the 7-day forecast out of a city page."""
        soup = BeautifulSoup(html, 'lxml')
        weather_data = []
        # The city name is the last link in the breadcrumb bar
        city = soup.find('div', class_='crumbs fl').find_all('a')[-1].text
        # The 7-day forecast lives in <div id="7d"> as a <ul class="t clearfix">
        days = soup.find('div', id='7d').find('ul', class_='t clearfix').find_all('li')
        for day in days:
            date = day.find('h1').text
            weather = day.find('p', class_='wea').text
            temp = day.find('p', class_='tem')
            high_temp = temp.find('span').text if temp.find('span') else ''
            low_temp = temp.find('i').text if temp.find('i') else ''
            wind = day.find('p', class_='win').find('i').text if day.find('p', class_='win') else ''
            weather_data.append({
                'city': city,
                'date': date,
                'weather': weather,
                'high_temp': high_temp,
                'low_temp': low_temp,
                'wind': wind
            })
        return weather_data

    def save_to_csv(self, data, filename):
        """Save the records to CSV (utf_8_sig adds a BOM so Excel opens it correctly)."""
        df = pd.DataFrame(data)
        df.to_csv(filename, index=False, encoding='utf_8_sig')
        print(f"Data saved to {filename}")

if __name__ == '__main__':
    spider = WeatherSpider()
    city_name = input("Enter a city name: ")
    city_code = spider.get_city_code(city_name)
    if city_code:
        weather_data = spider.get_weather_data(city_code)
        if weather_data:
            spider.save_to_csv(weather_data, f"{city_name}_weather.csv")
        else:
            print("No weather data retrieved")
    else:
        print("City not found")
5. Further Optimizations
5.1 Using Multithreading for Efficiency
from concurrent.futures import ThreadPoolExecutor

def multi_city_weather(cities):
    """Fetch several cities concurrently and flatten the results."""
    with ThreadPoolExecutor(max_workers=5) as executor:
        results = list(executor.map(get_single_city_weather, cities))
    # Each worker returns a list of records; flatten them into one list
    all_data = [item for sublist in results for item in sublist]
    return all_data
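The snippet above assumes a get_single_city_weather helper that takes a city name and returns that city's records; a minimal sketch built on the WeatherSpider class from section 4 could look like this:

# Hypothetical helper assumed by multi_city_weather above:
# resolves one city name and returns its forecast records (or []).
spider = WeatherSpider()

def get_single_city_weather(city_name):
    city_code = spider.get_city_code(city_name)
    if city_code:
        return spider.get_weather_data(city_code)
    return []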
5.2 Adding Exception Handling
def safe_get_weather(city_code):
    """Wrap the fetch so one failing city doesn't abort a batch run."""
    try:
        return get_weather_data(city_code)
    except Exception as e:
        print(f"Error fetching weather for {city_code}: {e}")
        return []
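On a flaky connection it can also help to retry a failed request a few times before giving up. A simple sketch building on safe_get_weather (the retry count and delay are arbitrary choices):

import time

def get_weather_with_retry(city_code, retries=3, delay=2):
    # Try up to `retries` times, sleeping `delay` seconds between attempts
    for attempt in range(retries):
        data = safe_get_weather(city_code)
        if data:
            return data
        time.sleep(delay)
    return []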
5.3 Using Proxy IPs
proxies = {
    'http': 'http://your_proxy:port',
    'https': 'https://your_proxy:port'
}
response = requests.get(url, headers=headers, proxies=proxies)
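When scraping many pages, rotating through a small pool of proxies spreads the load across addresses. A minimal sketch, assuming you maintain a list of working proxies (the addresses below are placeholders):

import random
import requests

# Placeholder pool -- replace with proxies you actually control
PROXY_POOL = [
    'http://proxy1:port',
    'http://proxy2:port',
]

def get_with_random_proxy(url, headers):
    # Pick a proxy at random for each request
    proxy = random.choice(PROXY_POOL)
    return requests.get(url, headers=headers,
                        proxies={'http': proxy, 'https': proxy},
                        timeout=10)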
6. Data Storage and Visualization
6.1 Saving to a Database
import sqlite3

def save_to_db(data, db_name='weather.db'):
    """Persist the records into a local SQLite database."""
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS weather (
            city TEXT,
            date TEXT,
            weather TEXT,
            high_temp TEXT,
            low_temp TEXT,
            wind TEXT
        )
    ''')
    for item in data:
        cursor.execute('''
            INSERT INTO weather VALUES (?, ?, ?, ?, ?, ?)
        ''', (item['city'], item['date'], item['weather'],
              item['high_temp'], item['low_temp'], item['wind']))
    conn.commit()
    conn.close()
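Once the data is in SQLite, it can be pulled straight back into a DataFrame for analysis, for example:

import sqlite3
import pandas as pd

# Read the stored forecasts back for analysis
conn = sqlite3.connect('weather.db')
df = pd.read_sql('SELECT * FROM weather', conn)
conn.close()
print(df.head())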
6.2 Visualizing with Matplotlib
import matplotlib.pyplot as plt

def plot_weather(data):
    """Plot the 7-day high/low temperature curves."""
    # Skip records whose temperatures could not be parsed (e.g. the missing
    # daytime high at night), otherwise int('') would raise
    data = [item for item in data if item['high_temp'] and item['low_temp']]
    dates = [item['date'] for item in data]
    highs = [int(item['high_temp'].replace('℃', '')) for item in data]
    lows = [int(item['low_temp'].replace('℃', '')) for item in data]
    plt.figure(figsize=(10, 6))
    plt.plot(dates, highs, label='High', marker='o')
    plt.plot(dates, lows, label='Low', marker='o')
    plt.xlabel('Date')
    plt.ylabel('Temperature (℃)')
    plt.title(f"7-day forecast for {data[0]['city']}")
    plt.legend()
    plt.grid()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
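Note that the dates and city names scraped from the site are in Chinese, which Matplotlib's default fonts cannot render. A common fix is to register a CJK-capable font before plotting; SimHei ships with Windows, so substitute a font available on your system elsewhere:

import matplotlib.pyplot as plt

# Use a CJK-capable font so Chinese labels render instead of empty boxes
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False  # keep minus signs rendering correctly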
7. Summary
This article covered the complete workflow for scraping weather data with Python, including:
Sending requests and handling responses
Parsing HTML and extracting data
Storing and visualizing the data
Scraper optimization techniques