使用爬虫爬取天气历史数据（https://lishi.tianqi.com/）

码破苍穹

已于 2022-09-29 12:37:43 修改

阅读量4.1k

点赞数 4

分类专栏：Python及其库使用相关　文章标签：python、爬虫、开发语言

于 2021-08-03 17:24:35 首次发布

本文链接：https://blog.csdn.net/leokingszx/article/details/119354678

版权

Python及其库使用相关专栏收录该内容

68 篇文章

订阅专栏

目标：获取《历史天气查询|历史天气预报查询|历史气温查询|过去天气查询_历史天气查询网》（https://lishi.tianqi.com/）这个页面中所有城市在 2021.1~2021.7 的历史天气数据。

一、构造爬虫请求头（随机 User-Agent）：

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import os
import random
from bs4 import BeautifulSoup
import time
import socket
import re

import pandas as pd
import os

# Proxy mapping passed to requests via `proxies=`; both schemes use the same
# placeholder string, which must be replaced with a real proxy URL before running.
proxies_ = dict.fromkeys(('http', 'https'), "你的proxy代理")

def get_header():
    """Return request headers whose User-Agent is taken from ``ua.random``.

    ``ua`` is the module-level fake_useragent ``UserAgent`` instance; a fresh
    value is read on every call.
    """
    return {'User-Agent': ua.random}

from fake_useragent import UserAgent
 
# Uses a local copy of the user-agent data; fake_useragent.json files can be
# found online.
# NOTE(review): the `path` keyword is presumably only accepted by some
# fake_useragent releases — confirm against the installed version.
ua = UserAgent(path='./fake_useragent.json')

print(ua.chrome)  # print the `chrome` attribute as a quick check that the data loaded

二、获取所有城市名：

# Download the landing page and parse it; get_citys() reads the city links
# from the first table of this document.
target = "http://lishi.tianqi.com/"
req = requests.get(target, headers=get_header(), timeout=60)
soup = BeautifulSoup(req.text, 'html.parser')


import re
# Regexes applied to the string form of each <a> tag (see get_citys).
# Raw strings are used, and the dot in "index.html" is escaped so that it only
# matches a literal "." instead of any character.
# Text between `href="` and `/index.html`:
get_city_pinyin  = r"""(?<=href=").*(?=/index\.html)"""
# Text between the first `>` and the closing `</a>`:
get_city_chinese = r"""(?<=>).*(?=</a>)"""

get_city_pinyin_pattern = re.compile(get_city_pinyin)
get_city_chinese_pattern = re.compile(get_city_chinese)


def get_citys(soup):
    """Collect city identifiers from the first table of the parsed page.

    Every ``<a>`` tag in the first ``<table>`` is converted to a string and
    matched against the module-level city regexes. Anchors for which the
    pinyin pattern finds nothing are skipped.

    Args:
        soup: Parsed document exposing ``find_all``.

    Returns:
        ``[pinyin_list, chinese_name_list]`` — two parallel lists holding the
        first pinyin match and the first Chinese-name match of each anchor.

    Raises:
        IndexError: If the page has no table, or a kept anchor yields no
            Chinese-name match.
    """
    first_table = soup.find_all('table')[0]
    pinyin_names, chinese_names = [], []
    for anchor in first_table.find_all('a'):
        markup = str(anchor)
        pinyin_hits = get_city_pinyin_pattern.findall(markup)
        if not pinyin_hits:
            continue
        chinese_name = get_city_chinese_pattern.findall(markup)[0]
        pinyin_names.append(pinyin_hits[0])
        chinese_names.append(chinese_name)
    return [pinyin_names, chinese_names]


# res is [pinyin_list, chinese_name_list], the two parallel lists returned by get_citys.
res = get_citys(soup)

三、获取各个城市的历史天气数据：

def get_cur_month_data(soup):
    """Extract the rows of one month page.

    Each ``<li>`` whose ``<div>`` children include ``<div class="th200">``
    becomes one row; a row holds the first ``get_day_res_pattern`` match of
    every ``<div>`` in that ``<li>``, in document order.

    Args:
        soup: Parsed document exposing ``find_all``.

    Returns:
        A list of rows, each row being a list of strings.

    Raises:
        IndexError: If a ``<div>`` of a kept row yields no pattern match.
    """
    rows = []
    for list_item in soup.find_all("li"):
        cells = list_item.find_all("div")
        if """<div class="th200">""" not in str(cells):
            continue
        rows.append([get_day_res_pattern.findall(str(cell))[0] for cell in cells])
    return rows

四、建立循环：

# res comes from `res = get_citys(soup)`: pinyin names first, Chinese names second.
pinyin_list, chinese_name_list = res

# Map Chinese name -> pinyin name, leaving out the first three entries
# (the original note says Zhurihe / Suyouqi are skipped).
city_pinyin_chinese = dict()
for pinyin, chinese in zip(pinyin_list[3:], chinese_name_list[3:]):
    city_pinyin_chinese[chinese] = pinyin

# NOTE(review): the mapping above drops three leading entries while the crawl
# list below drops only two — confirm which offset is intended.
pinyin_list = pinyin_list[2:]
random.shuffle(pinyin_list)  # randomise the crawl order



def get_now_time():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    # With no time tuple given, strftime formats the current local time.
    return time.strftime(timestamp_format)



import time

def random_sleep(low=1.72, high=5.19, precision=3):
    """Pause for a random duration to throttle the crawler.

    The duration is drawn uniformly between ``low`` and ``high`` and rounded
    to ``precision`` decimal places before sleeping. Calling with no
    arguments keeps the original 1.72–5.19 second range.

    Args:
        low: Lower bound of the pause, in seconds.
        high: Upper bound of the pause, in seconds.
        precision: Number of decimal places passed to ``round``.
    """
    time.sleep(round(random.uniform(low, high), precision))

import time
def long_sleep(low=32.74, high=64.75, precision=3):
    """Pause for a long random duration (an occasional extended break).

    The duration is drawn uniformly between ``low`` and ``high`` and rounded
    to ``precision`` decimal places before sleeping. Calling with no
    arguments keeps the original 32.74–64.75 second range.

    Args:
        low: Lower bound of the pause, in seconds.
        high: Upper bound of the pause, in seconds.
        precision: Number of decimal places passed to ``round``.
    """
    time.sleep(round(random.uniform(low, high), precision))

import pickle
def save_variable(v,filename):
    """Pickle ``v`` into ``filename``, overwriting any existing file.

    Args:
        v: Any picklable object.
        filename: Path of the file to write (opened in binary mode).
    """
    # The context manager closes the file even if pickling raises.
    with open(filename, 'wb') as f:
        pickle.dump(v, f)


def load_variable(filename):
    """Load and return the object pickled in ``filename``.

    Args:
        filename: Path of a pickle file (opened in binary mode).

    Returns:
        The unpickled object.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # this script (or another trusted source) wrote.
    # The context manager closes the file even if unpickling raises.
    with open(filename, 'rb') as f:
        return pickle.load(f)


# Both sets hold "<pinyin>|<yyyymm>" keys.
processed_city_month = set()   # pages already fetched, so a rerun can skip them
error_city_month = set()       # pages that failed, so they can be retried


# Captures the text between the `">` that ends an opening tag and `</div>`.
get_day_res="""(?<=">).*(?=</div>)"""
get_day_res_pattern = re.compile(get_day_res)


# 2021-01 .. 2021-07; the month is zero-padded by the format itself, so the
# range can be widened past September without producing malformed keys.
month_list = ["2021{:02d}".format(i) for i in range(1, 8)]

city_datas = dict()  # city_datas[pinyin][yyyymm] -> rows from get_cur_month_data

c = 0  # number of pages fetched and parsed successfully
print ("爬虫启动时间=", get_now_time())
start_time = time.time()

for name in pinyin_list:
    if name not in city_datas:
        city_datas[name] = dict()

    for month in month_list:
        query_item = str(name) + "|" + str(month)
        if query_item in processed_city_month:
            continue
        target = """https://lishi.tianqi.com/""" + str(name) + """/""" + str(month) + ".html"
        try:
            # NOTE(review): verify=False disables TLS certificate checks;
            # confirm this is really required by the proxy in use.
            req = requests.get(url=target, headers=get_header(), timeout=60, proxies=proxies_, verify=False)
            # Treat HTTP error responses as failures instead of recording
            # an error page as a successfully processed (empty) month.
            req.raise_for_status()
            soup = BeautifulSoup(req.text, 'html.parser')
            # Parsing stays inside the try so one malformed page is logged
            # for retry rather than aborting the crawl before anything is saved.
            month_data_list = get_cur_month_data(soup)
        except Exception as e:
            print (query_item, e)
            error_city_month.add(query_item)
            # Keep throttling on failures too, so a server that starts
            # rejecting requests is not hit in a tight loop.
            random_sleep()
            continue
        city_datas[name][month] = month_data_list
        processed_city_month.add(query_item)
        c += 1
        if c % 10 == 0:
            # Estimate the remaining minutes from the average time per page so far.
            cur_time = time.time()
            remaining_time = (len(pinyin_list) * len(month_list) - c) * (cur_time - start_time) / c
            remaining_minute = round(remaining_time / 60, 5)
            print (len(city_datas), remaining_minute)
        random_sleep()
        if c % 1001 == 0:  # take an occasional long break
            long_sleep()
        if c % 5000 == 0:  # periodic checkpoint
            index = c // 5000
            file_name = "city_datas_" + str(index) + ".pkl"
            save_variable(city_datas, file_name)


# Final snapshot of everything collected.
file_name = "city_datas_" + str("all") + ".pkl"
save_variable(city_datas, file_name)