介绍
本篇文章分享用python实现函数式爬虫,爬取天气网站(http://www.tianqihoubao.com/)上的天气信息数据,并将清洗好的数据保存到本地。
共有link.py和main.py两个文件,可以去主页下载。
函数部分
首先,引入必要的函数库
import requests
import re
from bs4 import BeautifulSoup
import openpyxl as op
初始化爬虫,返回soup对象
# Initialize the crawler: download one page and hand back a parsed soup.
def first(header, urls, timeouts):
    """Fetch *urls* with the given headers and timeout, parse it as HTML.

    The site serves GBK-encoded pages, so the response encoding is forced
    to 'gbk' before parsing.  Returns a BeautifulSoup object.
    """
    response = requests.get(urls, headers=header, timeout=timeouts)
    response.encoding = 'gbk'  # site pages are GBK, not UTF-8
    return BeautifulSoup(response.text, 'html.parser')
这个函数是提取网页底部所有日期的链接,方便爬取所有日期的数据
# Return the links and month names of every month page to crawl.
def link_obtain(soup):
    """Extract (href, name) pairs from the <div class="months"> navigation.

    Returns two parallel lists: the href of each month link and the month
    name (the title attribute truncated just after '天气', matching the
    original slicing behaviour).

    Bug fix: the previous string-splitting approach (`str(...).split('</a>')`)
    also processed the leftover text after the last </a>, where `find`
    returned -1 and a garbage entry was appended to both lists.  Iterating
    the <a> tags directly avoids that.
    """
    link = []
    other_name = []
    for months_div in soup.find_all('div', class_='months'):
        for anchor in months_div.find_all('a'):
            href = anchor.get('href', '')
            title = anchor.get('title', '')
            link.append(href)
            # Keep the title up to and including '天气' (e.g. "2020年01月南昌天气").
            other_name.append(title[:title.find('天气') + 2])
    return link, other_name
使用soup中的find_all函数,定位tr,提取要爬取的数据部分
# Extract the candidate data rows: every <tr> element in the page.
def key_obtain(soup):
    """Return all table-row (<tr>) tags found in *soup*."""
    return soup.find_all('tr')
将要提取的表格表头写入
# Write the table header row for one city's data file.
def table_tag_write(tem_list, filename):
    """Create <filename>.xlsx and fill row 1 with the column headers.

    The headers are pulled out of the first <tr> (tem_list[0]) with a regex
    matching the "<td>\\n  <b>text</b></td>" markup.  Returns the workbook
    and worksheet objects.
    """
    workbook = op.Workbook()   # fresh workbook
    sheet = workbook['Sheet']  # default sheet
    headers = re.findall(r'<td>\n[\s]*<b>(.*?)</b></td>', str(tem_list[0]))
    for col, text in enumerate(headers, start=1):
        sheet.cell(row=1, column=col).value = text
    workbook.save(filename + '.xlsx')
    return workbook, sheet
将表格中间的数据部分写入
# Append the body rows of one month's table to the data file.
def table_body_write(tem_list, local, filename):
    """Append the data rows of *tem_list* (the header row at index 0 is
    skipped) to <filename>.xlsx, starting after row offset *local*.

    Returns the updated offset so the next month's data continues where
    this one stopped.
    """
    workbook = op.load_workbook(filename + '.xlsx')
    sheet = workbook.active
    for row_idx in range(1, len(tem_list)):
        # The date lives in the row's <a> tag; strip spaces and newlines.
        date = str(tem_list[row_idx].a.string).replace(' ', '').replace('\n', '')
        cells = (str(tem_list[row_idx]).replace(' ', '')).split("<td>")
        cleaned = []
        for cell in cells:
            for junk in ('\r', '\n', '</td>', '</tr>'):
                cell = cell.replace(junk, '')
            cleaned.append(cell)
        cleaned[1] = date      # first data column is the cleaned date text
        cleaned = cleaned[1:]  # drop the leading "<tr>..." fragment
        for col_idx, value in enumerate(cleaned, start=1):
            sheet.cell(row=local + row_idx + 1, column=col_idx).value = value
    local = local + len(tem_list) - 1
    workbook.save(filename + '.xlsx')
    return local
主函数部分
主函数用来调用函数来实现爬取信息
# Main driver: crawl every month page for one city and write rows to xlsx.
import link

# Request headers: a referer plus a desktop user-agent keeps the site happy.
headers = {
    "referer": "https://www.baidu.com/",  # bug fix: was "baidu.com.com"
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
}

# Nanchang weather; swap the city name in the URL to crawl another city.
url = 'http://www.tianqihoubao.com/lishi/nanchang/month/202001.html'
timeouts = 60                    # request timeout (seconds)
filename = "nanchang_weather_1"  # output file name (without extension)

# Fetch the index page once: it yields both the header row and month links.
# (The original called link_obtain twice for the same result.)
soup_link_changsha = link.first(headers, url, timeouts)
link_changsha, other_name_changsha = link.link_obtain(soup_link_changsha)
tem_list_changsha = link.key_obtain(soup_link_changsha)
link.table_tag_write(tem_list_changsha, filename)

local = 0  # running row offset inside the xlsx file
# Robustness fix: the original try wrapped the whole loop, so the crawl
# survived only ONE failure (and `kk` could be undefined in the handler).
# Catching per iteration lets the crawl skip any bad month and keep going,
# as the original comment ("continue the loop on error") intended.
for kk in range(len(link_changsha)):
    print(kk)
    url_kk = 'http://www.tianqihoubao.com' + link_changsha[kk]
    try:
        soup_changsha_kk = link.first(headers, url_kk, timeouts)
        tem_list_changsha_kk = link.key_obtain(soup_changsha_kk)
        local = link.table_body_write(tem_list_changsha_kk, local, filename)
    except Exception as e:
        # Report the failing URL and error, then move on to the next month.
        print(url_kk)
        print(e)