概述
近些年,随着互联网行业快速发展,社会信息量呈爆炸式增长,如何深层次地发掘出网页数据源价值,建立可视化分析系统,是当前研究的热点和难点。
中国工程院网站是一个全国著名的学术网站,里面包含着成千上万的工程院院士信息。中国工程院院士作为学术带头人和青年科学家的引路人,绝大多数院士无论是学术还是学风,都堪称典范。
我们国家从积贫积弱到繁荣富强,科技从落后到逐渐发达,离不开中国工程院院士们的贡献。可以说,没有这些院士精英们,我们很难跟上发达国家的脚步。而对于这些院士们,我们却很少去深入的了解他们,所以,本次课程设计基于python爬取工程院院士的设计与实现对于我们来说具有很强的教育意义。
通过爬取中国工程院院士的信息,我们得以了解工程院院士的生平经历、研究成果、求知历程,更能使我们深刻的了解是谁,是哪一群抱着一颗赤诚之心的人,为了国家的科学事业鞠躬尽瘁,为我们的美好生活添砖加瓦。
设计任务:
编写网络爬虫程序,爬取中国工程院网页(http://www.cae.cn/cae/html/main/col48/column_48_1.html)上每位院士的信息。
要求:
将每位院士的简介存为本地文件,把每位院士的照片保存为本地图片,文本文件和图片文件都以院士的姓名为文件名。并将每位院士的姓名、简介、国籍、籍贯、性别、当选日期等信息写入csv中,方便上传MySQL数据库
爬取内容:
代码:
import os
import csv
import pandas as pd
import requests
import re
from urllib.request import urlopen
import pymysql
from tkinter import *
from PIL import Image, ImageTk
# Index page listing every academician of the Chinese Academy of Engineering.
start_url = 'http://www.cae.cn/cae/html/main/col48/column_48_1.html'
# Download the listing page; force UTF-8 so the Chinese text decodes correctly.
response = requests.get(start_url)
response.encoding = 'utf-8'
# Create a per-academician folder under ./Bigdata_spider_file.
def maker(path):
    """Create the folder ./Bigdata_spider_file/<path> if it does not exist.

    Args:
        path: Folder name (here: the count-prefixed academician name).
    """
    # Bug fix: the original tested os.path.exists(path) — the bare name
    # relative to the CWD — while creating ./Bigdata_spider_file/<path>,
    # so the check never matched the created folder and re-runs crashed.
    full_path = './Bigdata_spider_file/{}'.format(path)
    if not os.path.exists(full_path):
        os.makedirs(full_path)
    else:
        print("“{}”文件夹已存在".format(path))
# Extract every academician's numeric page id from the listing page.
everyone_url = re.findall(r'<a href="/cae/html/main/colys/(\d+).html" target="_blank">', response.text)
count = 1
name = []
number = 1
# Create the CSV file and write the header row.
# Bug fix: the original passed newline=''"" (a confusing implicit
# concatenation of '' and "" — same value, dead token removed).
f = open('Academician_data.csv', 'w', encoding='utf-8', newline='')
csv_writer = csv.writer(f)
csv_writer.writerow(["姓名", "民族", "性别", "当选日期", "国籍", "籍贯", "photo_URL", "介绍"])
def _first(matches):
    """Return the first element of *matches*, or '' when nothing matched.

    Guards the CSV write below: the original indexed [0] unconditionally
    and crashed with IndexError whenever a field was missing from a page.
    """
    return matches[0] if matches else ''


# Crawl each academician's detail page, save the biography text and photo
# locally, and append one row per academician to the CSV.
for person in everyone_url[:]:
    # Detail page URL for this academician.
    person_url = 'http://www.cae.cn/cae/html/main/colys/{}.html'.format(person)
    # Fetch the detail page.
    person_response = requests.get(person_url)
    person_response.encoding = 'utf-8'
    # Short biography shown in the "intro" div; strip <p> tags and spaces.
    person_text = re.findall('<div class="intro">(.*?)</div>', person_response.text, re.S)
    person_intro = re.sub(r'<p>| | |</p>', '', person_text[0]).strip()
    # Academician's name.
    person_name = re.findall('<div class="right_md_name">(.*?)</div>', person_response.text)[0]
    number += 1
    # Prefix with the running number so folder/file names stay unique.
    person_name = '{}.'.format(count) + person_name
    # Portrait photo (path relative to cae.cn).
    photo = r'<img src="/cae/admin/upload/img/(.+)" style='
    result = re.findall(photo, person_response.text, re.I)
    # Personal memorial-hall page carrying the structured attributes.
    academician_url = re.findall('<div class="cms_ysg_title"><a href="(.*?)" target="_blank">', person_response.text)
    academician_response = requests.get(academician_url[0])
    academician_response.encoding = 'utf-8'
    academician_text = re.findall('<div class="particular_info">([\s\S]*)</div>', academician_response.text, re.S)
    # The memorial-hall site uses two different page templates, so branch on
    # which one we received.
    if not academician_text:
        # Template 1: attributes tagged with class="text_justify".
        academician_name = re.findall(r'<div><h4 class="text_justify">姓名</h4><span>:</span>(.*?)</div>',
                                      academician_response.text)
        academician_nation = re.findall(r'<div><h4 class="text_justify">民族</h4><span>:</span>(.*?)</div>',
                                        academician_response.text)
        academician_gender = re.findall(r'<div><h4 class="text_justify">性别</h4><span>:</span>(.*?)</div>',
                                        academician_response.text)
        academician_nationality = re.findall(
            r'<h4 class="text_justify">国籍</h4><span>:</span>\s*<div '
            r'class="country_flag">\s*<img style="width: 30px" '
            r'class="img_fit_cover".*>\s*</div>\s*(\S+)', academician_response.text)
        text_native_place = re.findall(r'<h4 class="text_justify">籍贯</h4><span>:</span>\s*([^<]+)',
                                       academician_response.text)
        # Native place: the first run of 3–7 Chinese characters.
        list1 = re.findall(r'[\u4e00-\u9fa5]{3,7}', text_native_place[0])
        academician_native_place = list1[:1]
        # Election year: first "NNNN年" on the page.
        list2 = re.findall(r'\b\d{4}年\b', academician_response.text)
        academician_year = list2[:1]
    else:
        # Template 2: attributes tagged with class="row".
        academician_name = re.findall(r'<div><h4 class="row">姓名</h4><span>:</span><h4>(.*?)</h4></div>',
                                      academician_response.text)
        academician_nation = re.findall(r'<div><h4 class="row">民族</h4><span>:</span><h4>(.*?)</h4></div>',
                                        academician_response.text)
        academician_gender = re.findall(r'<div><h4 class="row">性别</h4><span>:</span><h4>(.*?)</h4></div>',
                                        academician_response.text)
        academician_nationality = re.findall(
            r'<h4 class="row">国籍</h4><span>:</span>\s*<div '
            r'class="country_flag">\s*<img class="img_fit_cover".*>\s*</div>\s*('
            r'\S+)', academician_response.text)
        text_native_place = re.findall(r'<div><h4 class="row">籍贯</h4><span>:</span>\s*<h4>([\s\S]*)</h4>\s*</div>',
                                       academician_response.text)
        # NOTE(review): this branch matches \u4e00-\u9fff while the other
        # uses \u4e00-\u9fa5 — kept as-is, but probably meant to be the same.
        list1 = re.findall(r'[\u4e00-\u9fff]{3,7}', text_native_place[0])
        academician_native_place = list1[:1]
        list2 = re.findall(r'\b\d{4}年\b', academician_response.text)
        academician_year = list2[:1]
    # Create the per-person folder and save the biography text.
    maker(person_name)
    # Bug fix: use a dedicated handle here — the original reused the name
    # `f`, shadowing the module-level CSV handle so the final f.close()
    # closed the wrong (already-closed) file.
    with open('./Bigdata_spider_file/{}/{}.text'.format(person_name, person_name), mode='a+', encoding='utf-8') as intro_file:
        intro_file.write('{}.'.format(count) + person_intro + '\n')
    # Bug fix: picurl must be bound even when no photo matched, because it
    # is written to the CSV unconditionally below (original raised NameError).
    picurl = ''
    if result:
        picurl = r'http://www.cae.cn/cae/admin/upload/img/{0}'.format(result[0].replace(' ', r'%20'))
        with open('./Bigdata_spider_file/{}/{}.jpg'.format(person_name, person_name), 'wb') as fpic:
            fpic.write(urlopen(picurl).read())
    count += 1
    # One CSV row per academician; missing fields become '' instead of crashing.
    csv_writer.writerow([_first(academician_name), _first(academician_nation),
                         _first(academician_gender), _first(academician_year),
                         _first(academician_nationality), _first(academician_native_place),
                         picurl, person_intro])
    print("写入成功")
f.close()
# The section below uploads the CSV into MySQL (a table has to be created
# first — the author notes they forgot the details) plus a simple Tkinter
# display console. It is disabled by being wrapped in a string literal.
# NOTE(review): inside the dead code, the INSERT statement lists 3 columns
# (name, photo_URL, intro) but supplies 8 values — it would fail if enabled.
# Left byte-identical because it is runtime (string) content, not a comment.
"""# Mysql_Store
try:
connect = pymysql.connect(
host="localhost",
port=3306,
user="root",
password="root",
database="bigdata",
charset="utf8"
)
cursor = connect.cursor()
print("连接成功")
except:
print("连接失败")
cursor.execute("drop table if exists academician_data")
sql_str = ("create table if not exists academician_data(name varchar(20),nation varchar(20),gender varchar(20),"
"nationality varchar(20),native_place varchar(20),photo_URL varchar(100),intro varchar(200));")
cursor.execute(sql_str)
data = open('Academician_data.csv', 'r', encoding='utf-8')
while True:
data_header = data.readline()
if not data_header:
break;
list1 = data_header.split(",")
# print(list1)
sql_str = ("insert into academician_data(name,photo_URL,intro) values('{0}','{1}','{2}','{3}','{4}','{5}','{6}',"
"'{7}');").format(list1[0], list1[1], list1[2], list1[3], list1[4], list1[5], list1[6], list1[7])
cursor.execute(sql_str)
connect.close()
data.close()
# 展示台部分
# visual
def show(test):
txt = test.get()
print(txt)
def visual_window(data):
name = data.get()
window = Toplevel()
window.title("{}院士信息展示".format(name))
window.geometry('550x350+360+200')
window.resizable(False, False)
image = Image.open("Bigdata_spider_file/{}/{}.jpg".format(name, name))
image = image.resize((120, 168))
img = ImageTk.PhotoImage(image)
photo = Label(window, image=img)
photo.place(x=20, y=15)
with open("Bigdata_spider_file/{}/{}.text".format(name, name), "r", encoding='utf-8') as f: # 打开文本
data = f.read() # 读取文本
text = Label(window, text=data, wraplength=330)
text.place(x=150, y=100)
window.mainloop()
# 通过Tk()方法建立一个根窗口
top = Tk()
top.title("院士信息数据可视化平台")
top.geometry('750x450+360+200')
# top.iconbitmap("logo.ico")
top.resizable(False, False)
Label(top, text="请输入查看数据:").place(x=310, y=170)
name = Entry(top)
name.place(x=310, y=190)
# 创建按钮,text是功能按钮的名称
Button(top, text="确认", bg="white", command=lambda: visual_window(name)).place(x=450, y=185)
Button(top, text="退出", bg="white", command=lambda: top.quit()).place(x=490, y=185)
# 进入等待处理窗口事件
top.mainloop()"""
希望我的文章能够帮到你,有帮到你的话点点赞,谢谢!!!