# -*- coding: utf-8 -*-
"""
Created on 2021/1/7 20:18
下载ruanjianxuebao论文并保存成PDF
@author: 本文大佬
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time
import os
import re
import requests
import urllib.request
# Shared HTTP request headers: spoof a desktop Chrome User-Agent so the
# journal site serves PDF content to this script instead of blocking it.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
# De-duplicate a list while preserving order
def string_duplicate_4(iterable):
    """Return the items of *iterable* as a list with duplicates removed,
    preserving first-seen order.

    Items must be hashable (same constraint as the original set-based
    implementation).
    """
    # dict preserves insertion order (Python 3.7+), so this is a
    # single-pass, order-preserving dedupe.
    return list(dict.fromkeys(iterable))
# Extract the issue (quarter) number from an issue-list URL
def getnum(url):
    """Return everything after ``quarter_id=`` in *url*, newline-stripped.

    Raises ValueError if the parameter is absent (same as the original
    ``str.index`` behaviour).
    """
    key = 'quarter_id='
    # len(key) replaces the original magic constant 11, which silently
    # depended on the key's length.
    start = url.index(key) + len(key)
    return url[start:].replace("\n", "")
def getIssue_browser(issueurl, issuepath):
    """Collect every issue-list link from the journal's issue-browser page
    and append them (one per line) to the file at *issuepath*.

    Parameters
    ----------
    issueurl : str
        URL of the journal's issue browser page.
    issuepath : str
        Path of the text file that receives the collected links.
    """
    browser = webdriver.Chrome()
    urlsets = set()
    try:
        browser.get(issueurl)  # open the issue-browser page
        wait = WebDriverWait(browser, 10)  # wait until the page is ready
        wait.until(EC.presence_of_all_elements_located((By.ID, 'Map')))
        time.sleep(3)
        # The issue links live inside the table with id 'QueryUI'; a found
        # element can itself be searched for nested elements.
        rso = browser.find_element(By.XPATH, "//table[@id='QueryUI']")
        anchors = rso.find_elements(By.TAG_NAME, 'a')  # all <a> tags
        for anchor in anchors:
            if anchor is not None:
                url = anchor.get_attribute('href')
                urlsets.add('\n' + url)
    finally:
        browser.close()
    print('开始获取链接', end='')
    # BUG FIX: the original loop variable was named ``re``, shadowing the
    # imported ``re`` module; also use ``with`` so the file is closed even
    # if a write fails.
    with open(issuepath, 'a+', encoding='utf-8') as fo:
        for link_line in urlsets:
            print('.', end='')
            fo.write(link_line)  # persist one link per line
    print("\n列表页链接抓取完毕!")
# Parse the per-issue pages for PDF download links and download each PDF
def getPdfUrl(path, haspath, startyear, endyear, knowtime, downtime,
              savedir='D:\\软件学报\\pdfFile\\'):
    """Visit every issue-list URL stored in *path*, scrape each paper's PDF
    link, and download the PDFs for issues whose year is within
    [*startyear*, *endyear*].

    Parameters
    ----------
    path : str
        File of issue-list URLs (one per line), produced by getIssue_browser.
    haspath : str
        File recording already-processed URLs, so an interrupted run can
        resume without re-downloading.
    startyear, endyear : int
        Inclusive year range to download.
    knowtime : int
        Seconds to wait for each issue page to settle.
    downtime : int
        Seconds to sleep between PDF downloads (passed to downPDF).
    savedir : str, optional
        Base directory for downloaded PDFs (generalizes the previously
        hard-coded path; default unchanged).
    """
    # Read all issue-list URLs.
    with open(path, "r") as f:
        iter_f = f.readlines()
    # Read URLs that were already processed in a previous run, if any.
    hasdown_f = []
    if os.path.exists(haspath):
        with open(haspath, "r") as f2:
            hasdown_f = f2.readlines()
    for line in iter_f:
        if len(line) <= 5:
            continue  # skip the leading blank/garbage line
        if line in hasdown_f:
            continue  # already processed — resume support
        myyear = re.findall(r"year_id=(.+?)&", line)  # year from the URL
        myqi = getnum(line)                           # issue number from the URL
        if not (startyear <= int(myyear[0]) <= endyear):
            print("\n该下载链接不在约定的年限中!")
            continue
        browser = webdriver.Chrome()
        try:
            browser.get(line)  # open the issue's table-of-contents page
            wait = WebDriverWait(browser, 10)
            wait.until(EC.presence_of_all_elements_located((By.ID, 'table3')))
            time.sleep(knowtime)
            # Paper links live inside the table with id 'table3'.
            rso = browser.find_element(By.XPATH, "//table[@id='table3']")
            anchors = rso.find_elements(By.TAG_NAME, 'a')
            urls = string_duplicate_4(list(anchors))
            pdfnamelist = []
            pdfurllist = []
            for r in urls:
                url = r.get_attribute('href')
                if url is not None and url.startswith('http://www.jos.org.cn/jos/ch/reader/view_abstract.aspx'):
                    pdfnamelist.append(r.text)   # paper title text
                if url is not None and url.startswith('http://www.jos.org.cn/jos/ch/reader/create_pdf.aspx'):
                    pdfurllist.append(url)       # direct PDF link
            x = 0
            for pdf_url in pdfurllist:
                pdfpath = savedir + myyear[0] + '年第' + myqi + '期论文\\'
                # Strip everything except CJK ideographs, digits and ASCII
                # letters so the title is a safe Windows filename.
                pdfname = re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "",
                                 pdfnamelist[x])
                # Titles advance by 2 per paper — each abstract apparently
                # contributes two anchors. NOTE(review): verify against the
                # live page layout.
                x = x + 2
                print(pdf_url)
                print(pdfname)
                downPDF(pdf_url, pdfpath, pdfname + '.pdf', downtime)
        finally:
            browser.close()
        # Record the processed URL so a crashed run does not repeat work.
        with open(haspath, 'a+', encoding='utf-8') as fo:
            fo.write(line)
        print("\n执行完毕!")
# Download one PDF
def downPDF(pdfurl, pdfpath, pdfname, downtime):
    """Download *pdfurl* into directory *pdfpath* under filename *pdfname*,
    then sleep *downtime* seconds to throttle requests.

    Parameters
    ----------
    pdfurl : str
        Direct URL of the PDF.
    pdfpath : str
        Target directory (created, including parents, if missing).
    pdfname : str
        Target filename, including the ``.pdf`` extension.
    downtime : int
        Seconds to sleep after the download completes.
    """
    # BUG FIX: os.mkdir failed when the parent directory did not exist;
    # makedirs creates the whole chain and tolerates an existing directory.
    os.makedirs(pdfpath, exist_ok=True)
    # Timeout prevents a dead connection from hanging the crawler forever.
    r = requests.get(pdfurl, headers=header, timeout=60)
    with open(pdfpath + pdfname, "wb") as code:
        code.write(r.content)
    time.sleep(downtime)
if __name__ == '__main__':
    # Configuration constants — could later be moved to a config file.
    print("start------------------")
    issueurl = 'http://www.jos.org.cn/jos/ch/reader/issue_browser.aspx'  # journal issue-browser URL
    issuepath = 'D:\\软件学报\\path\\firsturl.txt'   # file holding the collected issue links
    haspath = 'D:\\软件学报\\path\\hasdownurl.txt'   # file recording already-downloaded links
    startyear = 2015   # first year to download
    endyear = 2021     # last year to download
    knowtime = 10      # seconds to wait for each page to load
    downtime = 10      # seconds between PDF downloads
    # 1. Scrape the per-issue table-of-contents links and save them locally.
    # BUG FIX: getIssue_browser takes (issueurl, issuepath); the original
    # call passed only issueurl and raised TypeError before doing any work.
    getIssue_browser(issueurl, issuepath)
    # 2. Scrape the PDF links from each issue page and download the PDFs.
    getPdfUrl(issuepath, haspath, startyear, endyear, knowtime, downtime)
    print("end------------------")
# Tested and working.
# Environment: Windows 10 + Anaconda + Python 3.7, plus whatever other
# packages turn out to be missing — install them as needed.