Goal: from the historical weather query site lishi.tianqi.com, scrape the historical weather data of every listed city for 2021.1 ~ 2021.7.
1. Build the request headers:
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import os
import random
import time
import socket
import re
import pandas as pd

proxies_ = {
    'http': "your proxy address",
    'https': "your proxy address",
}

# Use a local user-agent library; the fake_useragent.json file can be found online.
ua = UserAgent(path='./fake_useragent.json')
print(ua.chrome)

# Return a fresh random User-Agent header for each request
def get_header():
    headers = {
        'User-Agent': ua.random
    }
    return headers
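If the local fake_useragent.json file cannot be found, a minimal fallback is to rotate through a small hand-picked pool of User-Agent strings instead. This is just a sketch; get_header_fallback and the UA strings below are my own illustrative choices, not part of the original script:

# Hedged fallback: rotate over a tiny static User-Agent pool if the local
# fake_useragent.json file is unavailable. The UA strings are illustrative only.
FALLBACK_UAS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
]

def get_header_fallback():
    return {'User-Agent': random.choice(FALLBACK_UAS)}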
2. Get all the city names:
target = "http://lishi.tianqi.com/"
req = requests.get(url=target, headers=get_header(),timeout=60)
soup = BeautifulSoup(req.text, 'html.parser')
# Regex patterns for extracting city info from the <a> tags
get_city_pinyin = """(?<=href=").*(?=/index.html)"""
get_city_chinese = """(?<=>).*(?=</a>)"""
get_city_pinyin_pattern = re.compile(get_city_pinyin)
get_city_chinese_pattern = re.compile(get_city_chinese)
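As a quick check of these two patterns (the sample tag below is made up, but follows the structure the regex expects):

# Quick check of the regex patterns against a made-up <a> tag of the expected shape
sample = '<a href="http://lishi.tianqi.com/beijing/index.html">北京</a>'
print(get_city_pinyin_pattern.findall(sample))   # ['http://lishi.tianqi.com/beijing']
print(get_city_chinese_pattern.findall(sample))  # ['北京']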
# Return the list of city names in pinyin plus their Chinese names
def get_citys(soup):
    pinyin_list = []
    chinese_name_list = []
    data_table = soup.find_all('table')[0]
    city_columns = data_table.find_all('a')
    for data_item in city_columns:
        data_item = str(data_item)
        pinyin = get_city_pinyin_pattern.findall(data_item)
        if (len(pinyin) == 0): continue
        pinyin = pinyin[0]
        chinese_name = get_city_chinese_pattern.findall(data_item)[0]
        pinyin_list.append(pinyin)
        chinese_name_list.append(chinese_name)
    return [pinyin_list, chinese_name_list]
res = get_citys(soup)
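A quick sanity check on the result (a sketch; the exact count depends on what the page currently lists):

# Inspect the parsed city lists
pinyin_list_check, chinese_name_list_check = res
print(len(pinyin_list_check), "cities found")
for py, zh in list(zip(pinyin_list_check, chinese_name_list_check))[:5]:
    print(zh, "->", py)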
3. Get each city's monthly historical weather data:
# Parse one month's worth of daily weather rows from the page
# (get_day_res_pattern is compiled in section 4, before this function is first called)
def get_cur_month_data(soup):
    d = soup.find_all("li")
    month_data_list = []
    for item1 in d:
        item2 = item1.find_all("div")
        if ("""<div class="th200">""" in str(item2)):
            day_list = []
            for item in item2:
                item = str(item)
                res = get_day_res_pattern.findall(item)[0]
                day_list.append(res)
            month_data_list.append(day_list)
    return month_data_list
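Each day_list ends up as the plain text of that row's <div> cells. My understanding is that the monthly pages list date, max temperature, min temperature, weather and wind per day, but treat those column names as an assumption about the page layout; here is a minimal sketch that turns one month's rows into a pandas DataFrame:

# Minimal sketch: one month's rows as a DataFrame.
# The column names are an assumed page layout, not confirmed from the source.
def month_to_dataframe(month_data_list):
    columns = ["date", "high_temp", "low_temp", "weather", "wind"]
    rows = [row[:len(columns)] for row in month_data_list if len(row) >= len(columns)]
    return pd.DataFrame(rows, columns=columns)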
4. Build the main crawl loop:
pinyin_list = res[0]  # res is the city-name result returned by res = get_citys(soup) above
chinese_name_list = res[1]
city_pinyin_chinese = dict()
for i in range(3, len(pinyin_list)):  # skip the first entries: Zhurihe (朱日和) and Sonid Right Banner (苏右旗)
    pinyin = pinyin_list[i]
    chinese = chinese_name_list[i]
    city_pinyin_chinese[chinese] = pinyin
pinyin_list = pinyin_list[2:]
random.shuffle(pinyin_list)  # shuffle the crawl order
def get_now_time():
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
# Throttle the crawler with a short random sleep
def random_sleep():
    A = 1.72
    B = 5.19
    a = random.uniform(A, B)
    C = 3  # precision used in round(value, precision)
    res = round(a, C)
    time.sleep(res)
# Occasionally take a much longer sleep
def long_sleep():
    A = 32.74
    B = 64.75
    a = random.uniform(A, B)
    C = 3  # precision used in round(value, precision)
    res = round(a, C)
    time.sleep(res)
import pickle

# Save a Python object to disk
def save_variable(v, filename):
    f = open(filename, 'wb')
    pickle.dump(v, f)
    f.close()

# Load a Python object from disk
def load_variable(filename):
    f = open(filename, 'rb')
    r = pickle.load(f)
    f.close()
    return r
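These two helpers are used below to checkpoint city_datas. As a usage sketch, a re-run could restore the last full checkpoint and rebuild processed_city_month from it, instead of starting from the empty containers declared in the next snippet (the file name matches the final save at the end):

# Usage sketch: resume from the last full checkpoint if it exists.
if os.path.exists("city_datas_all.pkl"):
    city_datas = load_variable("city_datas_all.pkl")
    processed_city_month = set(
        name + "|" + month
        for name, months in city_datas.items()
        for month in months
    )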
processed_city_month = set()  # city/month pairs already crawled, so a re-run can skip them
error_city_month = set()      # city/month pairs that failed, so they can be retried
get_day_res = """(?<=">).*(?=</div>)"""
get_day_res_pattern = re.compile(get_day_res)
month_list = ["20210" + str(i) for i in range(1, 8)]  # 2021.1 ~ 2021.7
city_datas = dict()
c = 0
print("Crawler start time =", get_now_time())
start_time = time.time()
for i in range(0, len(pinyin_list)):
    name = pinyin_list[i]
    if name not in city_datas:
        city_datas[name] = dict()
    for month in month_list:
        query_item = str(name) + "|" + str(month)
        if (query_item in processed_city_month): continue
        target = """https://lishi.tianqi.com/""" + str(name) + """/""" + str(month) + ".html"
        # print(target)
        try:
            req = requests.get(url=target, headers=get_header(), timeout=60, proxies=proxies_, verify=False)
        except Exception as e:
            print(query_item, e)
            error_city_month.add(query_item)
            continue
        soup = BeautifulSoup(req.text, 'html.parser')
        month_data_list = get_cur_month_data(soup)
        city_datas[name][month] = month_data_list
        processed_city_month.add(query_item)
        c += 1
        if (c % 10 == 0):
            cur_time = time.time()
            remaining_time = (len(pinyin_list) * len(month_list) - c) * (cur_time - start_time) / c
            remaining_minute = round(remaining_time / 60, 5)
            print(len(city_datas), remaining_minute)
        random_sleep()
        if (c % 1001 == 0):  # take a longer nap now and then
            long_sleep()
        if (c % 5000 == 0):  # periodically checkpoint the data
            index = int(c / 5000)
            file_name = "city_datas_" + str(index) + ".pkl"
            save_variable(city_datas, file_name)

file_name = "city_datas_" + str("all") + ".pkl"
save_variable(city_datas, file_name)
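pandas is imported at the top but never used in the original flow. As a closing sketch (the retry pass, the output file name, and the flat column layout are my own choices), the failed city/month pairs can be retried once and everything flattened into a single CSV:

# Closing sketch: one retry pass over the failed pairs, then flatten to CSV.
for query_item in list(error_city_month):
    name, month = query_item.split("|")
    target = "https://lishi.tianqi.com/" + name + "/" + month + ".html"
    try:
        req = requests.get(url=target, headers=get_header(), timeout=60,
                           proxies=proxies_, verify=False)
        soup = BeautifulSoup(req.text, 'html.parser')
        city_datas.setdefault(name, dict())[month] = get_cur_month_data(soup)
        error_city_month.discard(query_item)
    except Exception as e:
        print("retry failed:", query_item, e)
    random_sleep()

# Flatten {city: {month: [[day fields...], ...]}} into one table and export it.
rows = []
for name, months in city_datas.items():
    for month, month_data_list in months.items():
        for day in month_data_list:
            rows.append([name, month] + day)
df = pd.DataFrame(rows)
df.to_csv("city_weather_202101_202107.csv", index=False, encoding="utf-8-sig")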