Python 实现全国各省市天气信息爬取并入库
实现方案
- 确认爬取网站 https://www.tianqi.com/chinacity.html
- 爬取天气网站页面上的全部天气信息
- 数据清洗
- 写入数据库
实现代码
import requests
from bs4 import BeautifulSoup
import psycopg2
import time
# HTTP request headers passed as ``headers=`` to the requests.get calls below.
# The User-Agent value identifies the client as a desktop Chrome browser.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/80.0.3987.132 Safari/537.36 '
}
def get_city_list():
    """Fetch the city index page and map each listed place name to its URL slug.

    Downloads https://www.tianqi.com/chinacity.html, locates the
    ``div.citybox`` container and reads every ``<h2>`` heading, then every
    ``<h3>`` heading, inside it.

    Returns:
        dict: heading text -> ``href`` of the heading's first ``<a>`` with
        leading/trailing ``/`` removed. ``<h2>`` entries are inserted before
        ``<h3>`` entries; a repeated heading text keeps the last slug seen.

    Raises:
        requests.RequestException: on network failure, timeout or an HTTP
            error status.
        RuntimeError: if the page contains no ``div.citybox`` element.
    """
    url = 'https://www.tianqi.com/chinacity.html'
    # A timeout keeps the crawl from hanging forever on a stalled connection.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()

    # Parse the raw bytes as UTF-8. ``from_encoding`` only takes effect for
    # byte input, and decoding here removes the need for a
    # latin1 -> utf-8 round-trip on every heading (which raises
    # UnicodeEncodeError whenever requests already decoded the page correctly).
    soup = BeautifulSoup(response.content, 'lxml', from_encoding='utf-8')
    info_box = soup.find('div', class_='citybox')
    if info_box is None:
        raise RuntimeError('citybox container not found in %s' % url)

    city_dict = {}
    # Two passes keep the insertion order: all <h2> headings first, then <h3>.
    for tag_name in ('h2', 'h3'):
        for heading in info_box.find_all(tag_name):
            link = heading.find('a')
            if link is None or not link.get('href'):
                # A heading without a link has no slug to record.
                continue
            city_dict[heading.get_text()] = link.get('href').strip('/')
    return city_dict
def get_html(city, city_pinyin, year=None):
    """Fetch the 15-day forecast page of one city and format one SQL value row.

    Requests ``https://www.tianqi.com/<city_pinyin>/15``, reads the ``<li>``
    items of ``ul.weaul`` and builds a row from the first item.

    NOTE(review): the indentation of this file was lost, so it is unclear
    whether the first or the last forecast entry was meant to be returned;
    this implementation returns the first one -- confirm.
    NOTE(review): the page is parsed from ``response.text``; if requests
    guesses the wrong charset for this site the weather text may be garbled
    -- confirm against a live response.

    Args:
        city: Display name of the city; copied into the row as-is.
        city_pinyin: URL slug of the city; used in the URL and in the row.
        year: Year prefixed to the scraped date text. Defaults to the current
            local year instead of a hard-coded one.

    Returns:
        str: ``"\\n'<city>','<slug>','<year>-<date>','<v1>','<v2>','<weather>'\\n"``
        where ``v1``/``v2`` are the texts of the third and fourth ``<span>``
        of the entry. When the page cannot be fetched or parsed, the last
        four fields are empty strings. Single quotes inside any field are
        doubled so the row stays a valid list of SQL string literals.
    """
    def _sql_text(value):
        # Double embedded single quotes so the value remains one SQL literal.
        return str(value).replace("'", "''")

    def _row(date, low, high, weather):
        fields = (city, city_pinyin, date, low, high, weather)
        return '\n' + ','.join("'%s'" % _sql_text(field) for field in fields) + '\n'

    def _fallback(reason):
        # Report the cause, pause briefly before the caller moves on to the
        # next city, and hand back a row with only the city columns filled in.
        print('get_html failed for %s: %s' % (city_pinyin, reason))
        time.sleep(1)
        return_value = _row('', '', '', '')
        print(return_value)
        return return_value

    if year is None:
        year = time.strftime('%Y')

    url = 'https://www.tianqi.com/%s/15' % city_pinyin
    try:
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        return _fallback(exc)

    soup = BeautifulSoup(response.text, 'lxml')
    info_box = soup.find('ul', class_='weaul')
    day_items = info_box.find_all('li') if info_box is not None else []
    if not day_items:
        return _fallback('no forecast entries found')

    first_day = day_items[0]
    try:
        spans = first_day.find_all('span')
        weather = first_day.find('div', class_='weaul_z').get_text()
        date = spans[0].get_text()
        min_temperature = spans[2].get_text()
        max_temperature = spans[3].get_text()
    except (AttributeError, IndexError) as exc:
        # The entry does not have the expected <span>/<div> layout.
        return _fallback(exc)

    return_value = _row('%s-%s' % (year, date), min_temperature, max_temperature, weather)
    print(return_value)
    return return_value
# Crawl every entry of the city index whose slug does not contain 'province'
# and collect the formatted row that get_html returns for it.
return_list = []
city_dict = get_city_list()
for city_name, city_slug in city_dict.items():
    if 'province' not in city_slug:
        return_list.append(get_html(city_name, city_slug))
def sql_inset(return_list):
    """Insert the collected weather rows into the database in one transaction.

    Each element of ``return_list`` is a string holding six comma-separated,
    single-quoted values (the format produced by ``get_html``). The values are
    parsed back into fields and sent to the server as query parameters, so
    scraped text is never spliced into the SQL statement itself.

    NOTE(review): values are bound by position to city, city_code, date,
    air_temperature_top, air_temperature_bot, weather -- verify that the
    producer emits the two temperature fields in that order.

    Args:
        return_list: Iterable of row strings. ``None`` or blank entries are
            skipped with a printed notice.

    Raises:
        ValueError: if a row does not contain exactly six fields.
        psycopg2.Error: on connection or statement failure; nothing is
            committed in that case.
    """
    import csv  # local import: this function is the only user of the module

    insert_sql = '''INSERT INTO public.表名称(city, city_code, date, air_temperature_top, air_temperature_bot,
weather) VALUES (%s, %s, %s, %s, %s, %s) ; '''

    # Parse everything before opening a connection so a malformed row fails
    # fast and leaves no half-written transaction behind.
    rows = []
    for raw in return_list:
        if not raw or not raw.strip():
            print('skip empty row: %r' % (raw,))
            continue
        # The row is CSV with "'" as the quote character; a doubled quote
        # inside a field decodes to one literal quote.
        fields = next(csv.reader([raw.strip()], quotechar="'", skipinitialspace=True))
        if len(fields) != 6:
            raise ValueError('expected 6 fields, got %d: %r' % (len(fields), raw))
        rows.append(fields)

    print('连接数据库')
    conn = psycopg2.connect(database="数据库名称", user="用户名", password="密码",
                            host="数据库地址", port="端口")
    try:
        with conn.cursor() as cur:
            for fields in rows:
                cur.execute(insert_sql, fields)
        print('开始执行 sql 提交')
        conn.commit()
        print('sql 提交完毕 关闭sql 连接')
    finally:
        # Always release the connection; closing without a commit discards
        # any uncommitted inserts.
        conn.close()
# Script entry step: write the rows collected in return_list to the database.
sql_inset(return_list)