I taught myself some web scraping in my spare time between classes; more comments will be added to the code over time. The JSON file the code relies on is here: city codes for most cities on the China Weather Network (presented as a code block so nobody has to pay to download it).
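From the way main() below indexes it, contry_code.json is expected to look roughly like the following (a minimal sketch inferred from the keys the code reads; the single Nanning entry, with its page code 101300101, is only an illustrative example):

city_json_example = {
    "城市代码": [  # top-level list of provinces
        {
            "省": "广西",  # province name
            "市": [  # cities in that province
                {"市名": "南宁", "编码": "101300101"}  # city name and page code
            ]
        }
    ]
}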
# coding=utf-8
# @Author: yzh
# @Description: Scrape the 7-day weather forecast for Nanning
# @Date: 2020/8/24 10:35
# @File: weather_forecast.py
import json
import csv
import random
import time
import requests
from bs4 import BeautifulSoup
# Pool of User-Agent strings to rotate per request
UserAgentList = [
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 Edg/84.0.522.63',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44'
]
# Fetch the HTML content of a page
def getHtmlText(url):
    try:
        # Rotate the User-Agent on every request
        UserAgent = {'user-agent': random.choice(UserAgentList)}
        response = requests.get(url, headers=UserAgent, timeout=10)
        response.raise_for_status()
        # Use the detected encoding to avoid mojibake
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException:
        # Covers connection errors, timeouts and non-2xx responses alike
        return ''
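# Quick smoke test (commented out). 101300101 is used here as an assumed
# example code for Nanning, matching the URL pattern in main() below:
# html = getHtmlText('http://www.weather.com.cn/weather/101300101.shtml')
# print(html[:200])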
# Extract the forecast data from the page
def getData(html):
    soup = BeautifulSoup(html, 'html.parser')
    try:
        # Dates
        dates = soup.select('li.sky > h1')
        # Weather descriptions
        weathers = soup.select("li.sky > p.wea")
        # Temperatures
        temperatures = soup.select("li.sky > p.tem")
        # Wind directions
        wins = soup.select("li > p.win > em")
        # Wind force levels
        win_levels = soup.select("li > p.win > i")
        # Bail out if the page didn't contain the expected nodes
        # (e.g. the request failed and html is empty)
        if not dates:
            return ''
        # Bundle everything into one dict
        return {
            'dates': dates, 'weathers': weathers,
            'temperatures': temperatures,
            'wins': wins, 'win_levels': win_levels
        }
    except Exception:
        return ''
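# For reference, the selectors above assume each forecast day on
# www.weather.com.cn is rendered roughly like this (a simplified sketch,
# not the exact markup):
# <li class="sky">
#   <h1>24日(今天)</h1>
#   <p class="wea">多云</p>
#   <p class="tem"><span>33</span>/<i>26℃</i></p>
#   <p class="win">
#     <em><span title="东南风"></span><span title="东风"></span></em>
#     <i><3级</i>
#   </p>
# </li>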
# Print the data to the console
# def printData(data):
# print("{1:{0}^7}{2:{0}^8}{3:{0}^7}{4:{0}^8}{5:{0}^10}".format(
# chr(12288), "日期", "天气", "温度", "风向", "风力"))
# for i in range(7):
# date = data['dates'][i].text
# weather = data['weathers'][i].text
# highestTemper = data['temperatures'][i].find('span').text
# lowestTemper = data['temperatures'][i].find('i').text
# temperature = highestTemper + '/' + lowestTemper
# dayWin = data['wins'][i].find_all('span')[0].get('title')
# nightWin = data['wins'][i].find_all('span')[1].get('title')
# win = dayWin + '-' + nightWin
# win_level = data['win_levels'][i].text
# print("{1:{0}^3}{2:{0}^10}{3:{0}^6}{4:{0}^14}{5:{0}^3}".format(
# chr(12288), date, weather, temperature, win, win_level))
# Save the data to a CSV file
def saveData(province_name, city_name, data):
    # Open in append mode so successive cities end up in one file
    with open('weather_forecast.csv', 'a+', encoding='utf-8', newline='') as f:
        # Build a csv writer on top of the file object
        csv_writer = csv.writer(f)
        # Write the province/city banner
        csv_writer.writerow(["=" * 20 + province_name + "省 " + city_name + "市 " + " 一周天气预报" + "=" * 20])
        # Write the column header, padded with fullwidth spaces for alignment
        csv_writer.writerow(["{1:{0}^7}{2:{0}^8}{3:{0}^7}{4:{0}^8}{5:{0}^10}".format(
            chr(12288), "日期", "天气", "温度", "风向", "风力")])
        for i in range(7):
            date = data['dates'][i].text
            weather = data['weathers'][i].text
            # On the evening page the current day may have no high-temperature
            # <span>, so guard against a missing node
            span = data['temperatures'][i].find('span')
            highestTemper = span.text if span else ''
            lowestTemper = data['temperatures'][i].find('i').text
            temperature = highestTemper + '/' + lowestTemper
            # Day and night wind directions sit in the title attributes
            dayWin = data['wins'][i].find_all('span')[0].get('title')
            nightWin = data['wins'][i].find_all('span')[1].get('title')
            win = dayWin + '-' + nightWin
            win_level = data['win_levels'][i].text
            csv_writer.writerow(["{1:{0}^3}{2:{0}^10}{3:{0}^6}{4:{0}^14}{5:{0}^3}".format(
                chr(12288), date, weather, temperature, win, win_level)])
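# Note: each row above is one pre-formatted string, i.e. a single CSV cell,
# which keeps the console-style alignment but gives up real columns. A sketch
# of a column-per-field variant that is easier to process later (use
# 'utf-8-sig' instead of 'utf-8' if the file is meant for Excel):
# csv_writer.writerow(["日期", "天气", "温度", "风向", "风力"])
# csv_writer.writerow([date, weather, temperature, win, win_level])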
# Load the JSON file that stores every Chinese province and its city codes
def load_json_file(file_name):
    with open(file_name, 'r', encoding='UTF-8') as f:
        city_json = json.load(f)
    return city_json
def main():
    file_name = 'contry_code.json'
    city_json = load_json_file(file_name)
    for province in city_json['城市代码']:
        print("=" * 24 + "省份: " + province['省'] + " 一周天气预报" + "=" * 24)
        for city in province['市']:
            print("=" * 32 + city['市名'] + "市" + "=" * 32)
            url = 'http://www.weather.com.cn/weather/%s.shtml' % city['编码']
            html = getHtmlText(url)
            result = getData(html)
            if result == '':
                continue
            # Save the data to the CSV file
            saveData(province['省'], city['市名'], result)
            # Or print it to the console instead
            # printData(result)
            # Sleep 10 seconds so we don't hammer the server
            time.sleep(10)
        print('')
if __name__ == '__main__':
    main()
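To run the script, only two third-party packages are needed (assuming a standard pip setup):

pip install requests beautifulsoup4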
Result: (screenshot of the run omitted)