简介:爬取字段
'城市', '日期', '最高气温', '最低气温', '天气', '风向'
网址:历史天气查询|历史天气预报查询|历史气温查询|过去天气查询_历史天气查询网 (tianqi.com)
备注:该网站偶尔打不开,换个浏览器试一试
1、数据截图
2、替换url
2.1 操作流程
2.2 替换url
3、代码
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2023/12/21 0021 14:09
# @Author : 程序员ck君
# @File : gh2.py
# @Software: PyCharm
import time
import requests
import csv
from lxml import etree
import re
if __name__ == "__main__":
'''
https://lishi.tianqi.com/
'''
date = time.strftime('%Y_%m_%d_%H_%M_%S', time.localtime())
with open(f'{date}天气网_plus.csv', 'w', encoding='utf8', newline='') as filename:
csvwriter = csv.DictWriter(filename, fieldnames=[
'城市',
'日期',
'最高气温',
'最低气温',
'天气',
'风向'
])
csvwriter.writeheader()
# U-A 伪装
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
}
# 请求地址 url_num 是数量 // ulr_data
for index in range(2011, 2023):
for month in range(1, 13):
month_parm = f'0{month}'
if month >= 10:
month_parm = month
url_index = f"http://lishi.tianqi.com/guiyang1/{index}{month_parm}.html"
print(f'---------------------爬取 ** 贵阳 {index}{month_parm} **---------------')
time.sleep(1) # 休眠1秒
try:
index_html = requests.get(url=url_index, headers=headers).text
tree = etree.HTML(index_html)
for li in range(1, 30):
list = []
# /html/body/div[7]/div[1]/div[4]/ul/li[1]
# 日期
data_1 = tree.xpath(f'/html/body/div[7]/div[1]/div[4]/ul/li[{li}]/div[1]/text()')[0]
list.append(data_1)
# 最高温度
data_2 = tree.xpath(f'/html/body/div[7]/div[1]/div[4]/ul/li[{li}]/div[2]/text()')[0]
list.append(data_2)
# 最低温度
data_3 = tree.xpath(f'/html/body/div[7]/div[1]/div[4]/ul/li[{li}]/div[3]/text()')[0]
list.append(data_3)
# 天气
data_4 = tree.xpath(f'/html/body/div[7]/div[1]/div[4]/ul/li[{li}]/div[4]/text()')[0]
list.append(data_4)
# 风向
data_5 = tree.xpath(f'/html/body/div[7]/div[1]/div[4]/ul/li[{li}]/div[5]/text()')[0]
list.append(data_5)
dict = {
'城市': '贵阳',
'日期': list[0],
'最高气温': list[1],
'最低气温': list[2],
'天气': list[3],
'风向': list[4]
}
csvwriter.writerow(dict)
except:
pass
print('-------------------------------程序结束----------------------------------------------')