项目Github地址
每个会议有其独特之处,在定义完基类之后,我们为每个会议单独写一个类,定义其特有的函数,这些子类都继承自基类。由于AAAI、IJCAI对应的会议都来自各自的官网,所以各自单独写一个类,而ACL系列所有的会议都来自一个网站,所以可以统一写一个类。在各个子类中定义如何在各自论文页面中提取论文pdf下载链接以及论文标题的方法。
- ACLSeries
class ACLSeries(BasicSpider):
    """Spider for the ACL-anthology family of conferences.

    Every ACL-series conference is hosted on the same site with the same
    page layout, so one class handles all of them.
    """

    def __init__(self, opt):
        super(ACLSeries, self).__init__()
        self.opt = opt  # runtime options; consumed by the BasicSpider base class

    def get_content(self, url, year):
        """Fetch one paper page, extract the pdf link and title, then save.

        url  -- the paper's landing page
        year -- conference year, forwarded to saveFile (used for the save path)
        """
        page = self.get_page(url)
        soup = BeautifulSoup(page, 'lxml')
        # On ACL-anthology pages the first <a> inside the #title element
        # links straight to the pdf and its text is the paper title.
        tag = soup.select('#title a')[0]
        pdf_url = tag['href']
        title = tag.get_text().strip()
        print("论文pdf链接:" + str(pdf_url))
        # '/' is illegal in file names. str.replace is a no-op when the
        # character is absent, so no membership test is needed first.
        title = title.replace('/', 'or')
        print("论文标题:" + str(title))
        self.saveFile(pdf_url, title, year)
- IJCAI
class IJCAI(BasicSpider):
    """Spider for papers hosted on the IJCAI proceedings site."""

    def __init__(self, opt):
        super(IJCAI, self).__init__()
        self.opt = opt  # runtime options; consumed by the BasicSpider base class

    def get_content(self, url, year):
        """Fetch one paper page, extract the pdf link and title, then save.

        Newer proceedings pages expose a ``.btn-download`` anchor with an
        absolute href; older pages only have a plain "PDF" link whose href
        is relative to https://www.ijcai.org. Likewise the title lives in
        <h1> on newer pages and in the first <p> on older ones.

        url  -- the paper's landing page
        year -- conference year, forwarded to saveFile (used for the save path)
        """
        page = self.get_page(url)
        soup = BeautifulSoup(page, 'lxml')
        buttons = soup.select('.btn-download')
        if buttons:
            # New-style page: download button carries the full pdf url.
            pdf_url = buttons[0]['href']
        else:
            # Old-style page: scrape the relative link out of the raw html.
            pattern = re.compile('<p><a href="(.*?)">PDF</a></p>', re.S)
            pdf_url = 'https://www.ijcai.org' + pattern.findall(page)[0]
        print("论文pdf链接:" + str(pdf_url))
        pattern = re.compile('<h1>(.*?)</h1>', re.S)
        res = pattern.findall(page)
        if not res:
            # Old-style page: title is the leading text of the first <p>.
            pattern = re.compile('<p>(.*?)<br />.*?<i>.*?</i>.*?</p>', re.S)
            res = pattern.findall(page)
        # '/' is illegal in file names; replace is a no-op when absent.
        title = res[0].strip().replace('/', 'or')
        print("论文标题:" + str(title))
        self.saveFile(pdf_url, title, year)
- AAAI
class AAAI(BasicSpider):
    """Spider for the AAAI conference site.

    2019 pages can be parsed statically with BeautifulSoup; earlier years
    render the pdf link with JavaScript, so a headless Chrome driven by
    selenium is required for them.
    """

    def __init__(self, opt):
        super(AAAI, self).__init__()
        self.opt = opt  # runtime options; consumed by the BasicSpider base class

    def get_content(self, url, year):
        """Fetch one paper page, extract the pdf link and title, then save.

        url  -- the paper's landing page
        year -- conference year; selects the scraping strategy and is
                forwarded to saveFile (used for the save path)
        """
        if year == 2019:
            # 2019 pages are static: pdf link under .pdf, title in .page_title.
            page = self.get_page(url)
            soup = BeautifulSoup(page, 'lxml')
            pdf_url = soup.select('.pdf')[0]['href']
            title = soup.select('.page_title')[0].get_text().strip()
        else:
            # Pre-2019 pages need selenium. Retry until the elements render;
            # the page occasionally fails to load in headless Chrome.
            url = url.replace('view', 'viewPaper')
            while True:
                chrome_options = Options()
                chrome_options.add_argument('--headless')
                chrome_options.add_argument('--disable-gpu')
                browser = webdriver.Chrome(chrome_options=chrome_options)
                try:
                    browser.get(url)
                    browser.implicitly_wait(10)
                    link = browser.find_element_by_css_selector('#paper a')
                    pdf_url = link.get_attribute('href').replace('view', 'viewFile')
                    title = browser.find_element_by_css_selector('#title').text.strip()
                    break
                except NoSuchElementException:
                    print('selenium fail,重试。')
                finally:
                    # quit() shuts down the chromedriver process as well as
                    # the window; close() alone leaks a driver per retry.
                    browser.quit()
        # '/' is illegal in file names; replace is a no-op when absent.
        title = title.replace('/', 'or')
        print("论文pdf链接:" + str(pdf_url))
        print("论文标题:" + str(title))
        self.saveFile(pdf_url, title, year)