ACM&IEEE期刊会议论文关键词爬虫

如需作者、摘要等其它内容,可以通过 xpath 获取

ACM

#ACM会议论文的关键词隐藏于bibtex当中
import os
import re
import time
import random
import pandas as pd
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import Chrome, ChromeOptions
from openpyxl import load_workbook
from pybtex.database.input import bibtex
from selenium.webdriver.common.action_chains import ActionChains

#获取关键词
def getkw(paper):
   """Scrape the title and keywords of one ACM DL paper via its BibTeX export.

   paper: URL of the paper's ACM Digital Library page.
   Side effect: appends {"title", "keywords"} dicts to the module-level
   ``datas`` list. Entries missing either field are skipped.
   """
   # Open the paper page once (the original loaded it twice back to back).
   driver.get(paper)
   time.sleep(random.uniform(3, 5))
   # CSS selector of the "export BibTeX" icon on the paper page.
   xp = '#pb-page-content > div > main > div.container > article > div:nth-child(1) > div.col-md-8.col-sm-7 > div > div > div:nth-child(6) > div > div.issue-item__footer-links.pull-right > ul:nth-child(1) > li:nth-child(4) > a > i'
   # The BibTeX text lives in a popup that is only rendered after the
   # export button is clicked, so simulate a real mouse click first.
   si = driver.find_element(By.CSS_SELECTOR, xp)
   ActionChains(driver).click(si).perform()
   time.sleep(random.uniform(3, 5))
   html = etree.HTML(driver.page_source)
   # After the click the rendered citation block holds the BibTeX entry.
   pmsg = html.xpath('//div[@class="csl-right-inline"]/text()')
   parser = bibtex.Parser()
   try:
      bibdata = parser.parse_string(pmsg[0])
      for bib_id in bibdata.entries:
         fields = bibdata.entries[bib_id].fields
         try:
            datas.append({
               "title": fields["title"],
               "keywords": fields["keywords"],
            })
         except KeyError:
            # Entry has no title/keywords field — skip it silently.
            continue
   except IndexError:
      # pmsg was empty: the BibTeX popup never rendered.
      print("error: BibTeX block not found on page")
   except Exception as e:
      # Malformed BibTeX or similar parse failure; log and move on.
      print(f"error: {e}")
   driver.back()
   print("\033c", end="")
   time.sleep(random.uniform(2, 4))

def main():
   """Walk the dblp listing page and scrape keywords for every paper on it.

   Reads the already-loaded dblp proceedings page from the global ``driver``,
   extracts each paper's URL, scrapes it with getkw(), and appends the
   accumulated rows to the Excel file batch by batch.
   """
   global datas
   html = etree.HTML(driver.page_source)
   # Each paper row on dblp is marked by this icon; its parent <a> holds the URL.
   signs = html.xpath('//img[@src="https://dblp.uni-trier.de/img/paper.dark.hollow.16x16.png"]')
   # The first icon is the proceedings volume itself, not a paper — skip it
   # with a slice instead of mutating the list in place.
   for i, sign in enumerate(signs[1:], start=1):
       urls = sign.xpath('./parent::a//@href')
       if not urls:
           # Defensive: icon without an enclosing link — nothing to scrape.
           continue
       getkw(urls[0])
       write_to_exist_excel2(datas, i)
       datas = []

# 将list[dict]类型的数据追加写入到现有的Excel中
def write_to_exist_excel2(data_added, i):
    """Append a list of row dicts to test.xlsx located next to this script.

    data_added: list[dict] with "title"/"keywords" keys; one dict per row.
    i: batch index (kept for interface compatibility; not used here).
    """
    abs_path = os.path.dirname(os.path.abspath(__file__))
    fileName = os.path.join(abs_path, 'test.xlsx')
    sheetName = "Sheet1"
    # Row count of the existing data determines where appending starts.
    df_old = pd.read_excel(fileName, sheet_name=sheetName)
    row_old = df_old.shape[0]

    if not data_added:
        # Nothing to append — avoid writing an empty frame.
        return

    df = pd.DataFrame(data_added)

    # mode='a' + if_sheet_exists='overlay' replaces the old
    # writer.book/.sheets/.save() hack, which was removed in pandas 2.0.
    # The context manager also guarantees the file handle is closed on error.
    with pd.ExcelWriter(fileName, engine='openpyxl', mode='a',
                        if_sheet_exists='overlay') as writer:
        # +1 skips the header row that occupies row 0 of the sheet.
        df.to_excel(writer, sheet_name=sheetName,
                    startrow=row_old + 1, index=False, header=False)
    print(row_old)


if __name__ == "__main__":
   # Selenium 4 removed the executable_path/chrome_options kwargs;
   # the driver path now goes through a Service object.
   from selenium.webdriver.chrome.service import Service

   options = ChromeOptions()
   #options.add_argument('--disable-software-rasterizer')
   #options.add_argument('--disable-gpu')
   #options.add_argument('--headless')
   # Bug fix: Chrome flags require a double dash to be recognized.
   options.add_argument('--ignore-certificate-errors')
   options.add_argument('--ignore-ssl-errors')
   # chromedriver path — adjust to the local installation.
   driver_path = r"C:\Users\Administrator\AppData\Local\Google\Chrome\Application\chromedriver.exe"
   driver = webdriver.Chrome(service=Service(driver_path), options=options)

   # Open the dblp listing page of the target conference.
   url = 'https://dblp.uni-trier.de/db/conf/ccs/ccs2022.html'
   driver.get(url)

   # Pause to let the listing page finish loading.
   time.sleep(3)
   datas = []
   main()

   # quit() closes the browser AND terminates the chromedriver process;
   # close() only closes the window and leaves the driver running.
   driver.quit()
   # 关闭chromedriver进程

IEEE

#IEEE会议论文大多只有出版社提供的,期刊论文有自己的关键词
import os
import re
import time
import random
import pandas as pd
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import Chrome, ChromeOptions
from openpyxl import load_workbook

#获取关键词
def getkw(paper):
   """Scrape the title and IEEE keywords of one IEEE Xplore paper page.

   paper: URL of the paper's IEEE Xplore page.
   Side effect: appends a {"title", "keywords"} dict to the module-level
   ``datas`` list.
   """
   driver.get(paper)
   time.sleep(random.uniform(2, 4))
   # Expand the collapsed "Keywords" accordion so its content is rendered.
   driver.execute_script("$(arguments[0]).click()", driver.find_element(By.ID, "keywords-header"))
   # Give the accordion a moment to expand before reading page_source
   # (the original parsed immediately and could miss the content).
   time.sleep(random.uniform(1, 2))
   html = etree.HTML(driver.page_source)
   title = html.xpath('//h1[@class="document-title text-2xl-md-lh"]//text()')
   title_text = ''.join(title).strip()
   kws = html.xpath('//ul[@class="doc-keywords-list stats-keywords-list"]/li[contains(string(),"IEEE Keywords")]/ul//text()')
   kws_text = ''.join(kws).strip()
   # Bug fix: store the joined title string (title_text), not the raw
   # list of text nodes — the original computed title_text but never used it.
   datas.append({
      "title": title_text,
      "keywords": kws_text,
   })
   driver.back()
   time.sleep(random.uniform(2, 4))

def main():
   """Walk the dblp listing page and scrape keywords for every paper on it.

   Reads the already-loaded dblp proceedings page from the global ``driver``,
   scrapes each paper with getkw(), and appends the rows to the Excel file.
   Failures on individual papers are logged and skipped.
   """
   global datas
   html = etree.HTML(driver.page_source)
   # Each paper row on dblp is marked by this icon; its parent <a> holds the URL.
   signs = html.xpath('//img[@src="https://dblp.uni-trier.de/img/paper.dark.hollow.16x16.png"]')
   # The first icon is the proceedings volume itself, not a paper — skip it.
   for i, sign in enumerate(signs[1:], start=1):
       try:
          urls = sign.xpath('./parent::a//@href')
          getkw(urls[0])
          write_to_exist_excel2(datas, i)
          datas = []
       except Exception as e:
          # Narrowed from a bare except (which also swallowed Ctrl-C);
          # log the failure instead of hiding it, then continue.
          print(f"skip paper {i}: {e}")
          continue

# 将list[dict]类型的数据追加写入到现有的Excel中
def write_to_exist_excel2(data_added, i):
    """Append a list of row dicts to test.xlsx located next to this script.

    data_added: list[dict] with "title"/"keywords" keys; one dict per row.
    i: batch index (kept for interface compatibility; not used here).
    """
    abs_path = os.path.dirname(os.path.abspath(__file__))
    fileName = os.path.join(abs_path, 'test.xlsx')
    sheetName = "Sheet1"
    # Row count of the existing data determines where appending starts.
    df_old = pd.read_excel(fileName, sheet_name=sheetName)
    row_old = df_old.shape[0]

    if not data_added:
        # Nothing to append — avoid writing an empty frame.
        return

    df = pd.DataFrame(data_added)

    # mode='a' + if_sheet_exists='overlay' replaces the old
    # writer.book/.sheets/.save() hack, which was removed in pandas 2.0.
    # The context manager also guarantees the file handle is closed on error.
    with pd.ExcelWriter(fileName, engine='openpyxl', mode='a',
                        if_sheet_exists='overlay') as writer:
        # +1 skips the header row that occupies row 0 of the sheet.
        df.to_excel(writer, sheet_name=sheetName,
                    startrow=row_old + 1, index=False, header=False)


if __name__ == "__main__":
   # Selenium 4 removed the executable_path/chrome_options kwargs;
   # the driver path now goes through a Service object.
   from selenium.webdriver.chrome.service import Service

   chrome_options = ChromeOptions()
   #chrome_options.add_argument('--disable-software-rasterizer')
   #chrome_options.add_argument('--disable-gpu')
   #chrome_options.add_argument('--headless')
   # chromedriver path — adjust to the local installation.
   driver_path = r"C:\Users\Administrator\AppData\Local\Google\Chrome\Application\chromedriver.exe"
   # Bug fix: chrome_options was built but never passed to the driver.
   driver = webdriver.Chrome(service=Service(driver_path), options=chrome_options)

   # Open the dblp listing page of the target workshop proceedings.
   url = 'https://dblp.uni-trier.de/db/conf/sp/sp2022w.html'
   driver.get(url)

   # Pause to let the listing page finish loading.
   time.sleep(3)
   datas = []
   main()

   # quit() closes the browser AND terminates the chromedriver process;
   # close() only closes the window and leaves the driver running.
   driver.quit()
   # 关闭chromedriver进程

注意:dblp 会议论文列表中匹配到的第一项是会议论文集本身,并非论文,需要跳过。
会议
期刊

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值