所要用到的库
import requests
from lxml import etree
import xlwt
访问URL
这里我们要用 xpath 来解析数据，所以把返回的网页数据解析成 html 元素树。
没学过 xpath 的可以看看这篇博客，写得还是很详细的：
xpath 教程链接
https://blog.csdn.net/u013332124/article/details/80621638
def ask_url(url, timeout=10):
    """Fetch *url* and return the parsed lxml HTML tree.

    Args:
        url: Page URL to request.
        timeout: Seconds before the request is aborted (default 10);
            without it a stalled server would hang the scraper forever.

    Returns:
        An ``lxml.etree._Element`` root for xpath queries.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    # Spoof a desktop browser User-Agent so the site serves the normal page.
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
    }
    response = requests.get(url, headers=header, timeout=timeout)
    # If the server omits a charset, requests defaults to ISO-8859-1 and
    # Chinese text comes out garbled; trust the content-sniffed encoding
    # instead (this replaces the raw_unicode_escape workaround used later).
    response.encoding = response.apparent_encoding
    # Parse into an element tree so callers can run xpath on it.
    html = etree.HTML(response.text)
    return html
解析数据
所要爬取的数据有如下
先是爬取每个省份URL的尾部,然后一个个访问。
def get_data(url):
html = ask_url(url)
base_url = 'http://www.weather.com.cn'
province_name = [] # 省份名字
# province_url = [] # 省份的URL
city_name = [] # 城市名称
weather = [] # 天气现象
wind_direction = [] # 风向
wind_power = [] # 风力
max_temperature = [] # 最高温
min_temperature = [] # 最低温
data = [] # 数据汇总
province_name_decode = html.xpath('//div[@class="lqcontentBoxheader"]//a[@target="_blank"]/text()')
for i in range(len(province_name_decode)):
# print(province_name_decode[i].encode('raw_unicode_escape').decode())
province_name.append(province_name_decode