程序运行截图:
mysql代码:
-- Articles scraped from journals.sagepub.com, with machine translations.
CREATE TABLE `article` (
`id` int(11) NOT NULL, -- random id assigned by the crawler (collision-prone; see crawler NOTE)
`article_time` varchar(50) DEFAULT NULL, -- publication date text scraped from the issue page
`article_volume` varchar(20) DEFAULT NULL, -- journal volume number
`article_author` varchar(2000) DEFAULT NULL, -- author list (may be empty)
`article_name_english` varchar(2000) DEFAULT NULL, -- original English title
`article_name_chinese` varchar(2000) DEFAULT NULL, -- machine-translated title
`article_content_english` varchar(5000) DEFAULT NULL, -- English abstract
`article_content_chinese` varchar(2000) DEFAULT NULL, -- machine-translated abstract
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8
-- NOTE(review): MySQL "utf8" is the 3-byte utf8mb3; consider utf8mb4 for full Unicode.
python代码:
import random
import re
import urllib.parse

import pymysql
import requests
# Open the MySQL connection shared by the whole script.
# NOTE(review): MySQL normally listens on 3306 — port 8080 looks unusual; confirm.
db = pymysql.connect(host='localhost',
                     port=8080,
                     user='root',
                     passwd='123',
                     db='students',
                     charset='utf8')
# Cursor used by get_data() for every INSERT.
cursor = db.cursor()
# Purpose: fetch, for each past year, article titles (EN/CN), authors, abstracts (EN/CN) and dates.
# Translation helper — param: content is English text.
def translator_chinese(content):
    """Translate an English snippet into Chinese via the iciba AJAX endpoint.

    Args:
        content: English text to translate.

    Returns:
        The translated Chinese string, or "" when the service is unreachable
        or returns no usable result (best-effort, never raises).
    """
    # URL-encode the query text — raw spaces/quotes would produce a bad URL.
    # The surrounding double quotes mirror the original request format.
    url = ("http://fy.iciba.com/ajax.php?a=fy&f=en&t=zh-CHS&w="
           + urllib.parse.quote('"' + content + '"'))
    try:
        text = requests.get(url, timeout=10).text
    except requests.RequestException:
        # Network failure: degrade to an empty translation instead of crashing.
        return ""
    matches = re.findall(r'"out":"(.*?)","ci', text, re.S)
    if not matches:
        return ""
    try:
        # The service returns \uXXXX escapes; decode them to real characters.
        result = matches[0].encode('ascii').decode('unicode_escape')
    except UnicodeEncodeError:
        # Already contains literal non-ASCII text — use it as-is.
        result = matches[0]
    # Strip the curly quotes the wrapping double quotes come back as.
    return result.replace("“", "").replace("”", "")
# for test
# print(translator_chinese(" therefore, be treated as a unity of contradictions."))
#通过年份获取数据
def get_data(year):
    """Crawl the sagepub "oss" journal index for *year* and persist every
    article's metadata (date, volume, authors, EN/CN title, EN/CN abstract)
    into the `article` MySQL table.

    Uses the module-level ``db``/``cursor`` connection and
    :func:`translator_chinese`. Raises on network errors or unexpected page
    structure (index lookups below assume the patterns match).
    """
    index_url = "https://journals.sagepub.com/loi/oss?year=%i" % year
    response = requests.get(index_url, timeout=30)
    print("*" * 300)
    print("开始爬取%s年的文献数据!" % year)
    # Volume number: second "expander" element on the index page.
    jz = re.findall(r'class="expander".*?data-attr-vol="(.*?)"', response.text, re.S)[1]
    print("卷宗:" + jz)
    # Issue (table-of-contents) URLs for the year.
    article_ml = re.findall(r'class="row js_issue".*?href="(.*?)"', response.text, re.S)
    print("文献目录地址:")
    for i, issue_url in enumerate(article_ml, 1):
        print(str(i) + "." + issue_url)
    print("*" * 300)
    basic_url = "https://journals.sagepub.com"
    for issue_url in article_ml:
        issue_page = requests.get(issue_url, timeout=30)
        # NOTE(review): the HTML tag anchoring this pattern was lost when the
        # source was pasted (the literal before "\n" is missing) — restore it
        # from the live page before relying on this match.
        article_time = re.findall(r'\n(.*?)\n', issue_page.text, re.S)
        # Publication date: drop everything up to and including the first comma.
        pub_time = article_time[0][article_time[0].index(",") + 1:]
        print("文献时间:" + pub_time)
        # Per-article relative URLs on the issue page.
        addr = re.findall(r'class="ref nowrap" href="(.*?)"', issue_page.text, re.S)
        print("文献列表地址:")
        for lb, rel in enumerate(addr, 1):
            print(str(lb) + "." + rel)
        for ad in addr:
            print("*" * 300)
            article_page = requests.get(basic_url + ad, timeout=30)
            article_c = re.findall(r'property="og:title" content="(.*?)"',
                                   article_page.text, re.S)
            if not article_c:
                # No og:title — abandon the rest of this issue (original behavior).
                break
            title = article_c[0]
            if "-" in title:
                # "Title - Authors": split on the first dash.
                article_author = title[title.index("-") + 1:]
                article_name_english = title[:title.index("-")]
            else:
                article_author = ""
                article_name_english = title
            article_name_chinese = translator_chinese(article_name_english)
            print("文献英文名字:" + article_name_english)
            print("文献中文名字:" + article_name_chinese)
            print("作者名字:" + article_author)
            # Abstract. NOTE(review): this pattern also lost its HTML anchors in
            # the paste — reconstruct the surrounding tags from the live page.
            abstract = re.findall(r'(.*?)', article_page.text, re.S)
            if abstract:
                article_content_english = abstract[0]
                article_content_chinese = translator_chinese(abstract[0])
            else:
                article_content_english = ""
                article_content_chinese = ""
            print("英文摘要:" + article_content_english)
            print("中文摘要:" + article_content_chinese)
            # Parameterized INSERT — the original concatenated values into the
            # SQL string, which is injection-prone and breaks on quotes; the
            # driver now handles all escaping.
            # NOTE(review): a random id can collide with the PRIMARY KEY — an
            # AUTO_INCREMENT column would be safer.
            article_id = random.randint(0, 999999999)
            sql = ("insert into article(id,article_time,article_volume,"
                   "article_author,article_name_english,article_name_chinese,"
                   "article_content_english,article_content_chinese) "
                   "values(%s,%s,%s,%s,%s,%s,%s,%s)")
            cursor.execute(sql, (article_id, pub_time, jz, article_author,
                                 article_name_english, article_name_chinese,
                                 article_content_english,
                                 article_content_chinese))
            print("id:%i数据爬取成功!" % article_id)
            db.commit()
# 主函数
if __name__ == '__main__':
    # Crawl the 2015 and 2016 volumes, then close the shared connection.
    # (The original used for/else; with no break the else always ran, so a
    # plain sequence is equivalent.)
    for crawl_year in range(2015, 2017):
        get_data(crawl_year)
    print("数据爬取完成!")
    db.close()
程序可能存在部分bug,欢迎交流指正。