启动类
根据分项一级级抓取
package com.rhhz;
import java.io.File;
import java.io.FileFilter;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import javax.persistence.EntityManager;
import javax.persistence.EntityManagerFactory;
import javax.persistence.Query;
import javax.servlet.ServletContextEvent;
import javax.servlet.ServletContextListener;
import javax.servlet.annotation.WebListener;
import org.springframework.web.context.WebApplicationContext;
import org.springframework.web.context.support.WebApplicationContextUtils;
import com.rhhz.core.element.metadata.bean.Article;
import com.rhhz.core.element.metadata.bean.Journal;
import com.rhhz.core.element.metadata.bean.JournalCatalog;
import com.rhhz.datatransfer.cnki.CnkiArticleMetaByJournalSipder;
@WebListener
public class SystemInitListenerCnki implements ServletContextListener{
/**
 * Called by the servlet container when the context is being shut down.
 * The body is empty, so no cleanup is performed here.
 *
 * @param sce the servlet context event (unused)
 */
@Override
public void contextDestroyed(ServletContextEvent sce) {
}
private static WebApplicationContext webApplicationContext;

/**
 * Runs the CNKI crawl workflow once the servlet context has started.
 *
 * <p>The workflow consists of several manual steps. Steps that are not part of the current run are
 * kept below as commented-out calls; as written, only step 3.[2] (HTML page crawl for every catalog
 * with {@code year < 2018}) is executed.
 *
 * @param event servlet context event used to look up the Spring web application context
 */
@Override
public void contextInitialized(ServletContextEvent event) {
    final String journalPath = "D:\\webDriver\\SPKJ";
    final String journalURL = "https://navi.cnki.net/knavi/JournalDetail?pcode=CJFD&pykm=SPKJ";

    // Resolve the JPA factory from the Spring context and open one EntityManager for the whole run.
    webApplicationContext = WebApplicationContextUtils.getWebApplicationContext(event.getServletContext());
    Object factoryBean = webApplicationContext.getBean("entityManagerFactory");
    EntityManager entityManager = ((EntityManagerFactory) factoryBean).createEntityManager();

    // 1. Collect the article links of every issue of the journal
    //new CnkiArticleMetaByJournalSipder().spiderArticleLinks(journalURL,journalPath,2017,24,"",0);

    // 2. Create the journal catalog (issue) rows
    //createJournalCatalog(entityManager, journalPath, journalURL);

    /************************************************************************************************************************/
    // 3.[1] Crawl the article metadata pages
    //List<JournalCatalog> catalogList = entityManager.createQuery("FROM "+JournalCatalog.class.getName()+" where year <2018 and remark is NULL ORDER BY year DESC,issue DESC").getResultList();
    //handleMetaArticle(entityManager, journalPath, journalURL, catalogList);

    // 3.[2] Crawl the article HTML pages
    String catalogJpql = "FROM " + JournalCatalog.class.getName()
            + " where year <2018 ORDER BY year DESC,issue DESC";
    List<JournalCatalog> catalogList = entityManager.createQuery(catalogJpql).getResultList();
    handleHtmlArticle(entityManager, journalPath, journalURL, catalogList);

    // 4.[3] PDF handling: (1) crawl and process from CNKI; (2) rename and process custom PDFs
    //String pdfPath = "E:\\\\SPKJ_PDF\\\\";// backslashes are escaped once for Java and once more for SQL
    //handleCnkiPDF(entityManager, pdfPath);
    //#### rename
    //String sqlPattern = "select concat('ren "+pdfPath+"',right(article_year,4),'-',article_issue,'-',article_fpage,'.pdf ',article_publisher_id,'.pdf') from meta_article ";
    //#### move each file into the folder of its issue
    //String sqlPattern = "select concat('move "+pdfPath+"',article_publisher_id,'.pdf ','"+pdfPath+"',article_year,'\\\\',article_issue,'\\\\PDF\\\\') from meta_article ";
    //handleCustomPDF(entityManager, pdfPath,sqlPattern);

    System.out.println("程序执行完毕!!!");
}
/**
 * Creates or updates one {@link JournalCatalog} row for every issue link file found in {@code journalPath}.
 *
 * <p>Issue files are the entries of {@code journalPath} whose name starts with {@code "yq"}. The digits
 * of the file name are read as a four-digit year followed by the issue number, and the file name without
 * {@code ".txt"} becomes the catalog id. All catalogs are merged in a single transaction, which is rolled
 * back if any merge fails.
 *
 * @param entityManager entity manager used to load the single {@link Journal} row and to merge the catalogs
 * @param journalPath   directory that contains the {@code yq*.txt} issue files
 * @param journalURL    journal page URL; the value of its {@code pykm} query parameter, when present,
 *                      is used to build the cover image path
 */
public void createJournalCatalog(EntityManager entityManager,String journalPath,String journalURL) {
    File[] issueFiles = new File(journalPath).listFiles(new FileFilter() {
        @Override
        public boolean accept(File pathname) {
            return pathname.getName().startsWith("yq");
        }
    });
    if (issueFiles == null) {
        // listFiles returns null when the path is not a directory or cannot be read.
        System.out.println("Journal directory is missing or unreadable: " + journalPath);
        return;
    }

    String journalPublisherId = extractJournalPublisherId(journalURL);
    Journal journal = (Journal) entityManager.createQuery("FROM "+Journal.class.getName()).getSingleResult();

    List<JournalCatalog> catalogList = new ArrayList<JournalCatalog>();
    for (File issueFile : issueFiles) {
        String fileName = issueFile.getName();
        String yearAndIssue = fileName.replaceAll("\\D", "");
        if (yearAndIssue.length() < 4) {
            // Without at least a four-digit year the name cannot be split into year and issue.
            System.out.println("Skipping issue file with unexpected name: " + fileName);
            continue;
        }
        JournalCatalog catalog = new JournalCatalog();
        catalog.setId(fileName.replace(".txt", ""));
        catalog.setYear(yearAndIssue.substring(0, 4));
        catalog.setIssue(yearAndIssue.substring(4));
        catalog.setState("0");
        catalog.setReleaseState(1);
        catalog.setJournalId(journal.getId());
        if (!"".equals(journalPublisherId)) {
            catalog.setCoverImgSrc("journal/img/cover/"+journalPublisherId+yearAndIssue+".png");
        }
        catalogList.add(catalog);
    }

    try {
        entityManager.getTransaction().begin();
        for (JournalCatalog journalCatalog : catalogList) {
            entityManager.merge(journalCatalog);
        }
        entityManager.getTransaction().commit();
    } catch (Exception e) {
        e.printStackTrace();
        // rollback() throws if the transaction never became active (e.g. begin() itself failed).
        if (entityManager.getTransaction().isActive()) {
            entityManager.getTransaction().rollback();
        }
    }
}

/**
 * Returns the value of the {@code pykm} query parameter of a CNKI journal URL, e.g. {@code "LCGD"} for
 * {@code https://navi.cnki.net/knavi/JournalDetail?pcode=CJFD&pykm=LCGD}, or {@code ""} when the
 * parameter is absent. Any parameters following {@code pykm} are cut off.
 */
private static String extractJournalPublisherId(String journalURL) {
    int start = journalURL.indexOf("pykm=");
    if (start == -1) {
        return "";
    }
    String value = journalURL.substring(start + 5);
    // The cut position must be computed on the extracted value, not on the full URL.
    int end = value.indexOf('&');
    return end == -1 ? value : value.substring(0, end);
}
/**
 * Refreshes the abstract of stored articles on a pool of four worker threads.
 *
 * <p>The original per-catalog metadata crawl is kept below as commented-out code. The active code
 * selects the 2016 articles whose abstract contains neither {@code <} nor {@code >} and whose
 * {@code article_remark1} is null, and queues one {@code updateAbstrat} task per article. The method
 * returns as soon as all tasks are queued; the pool is shut down so its threads end once the queue drains.
 *
 * @param entityManager entity manager used for the query and handed to every task
 * @param journalPath   only used by the disabled crawl code
 * @param journalURL    only used by the disabled crawl code
 * @param catalogList   only used by the disabled crawl code
 */
@SuppressWarnings("unchecked")
public void handleMetaArticle(EntityManager entityManager,String journalPath,String journalURL,List<JournalCatalog> catalogList) {
    // Crawl the article metadata (currently disabled)
//  for (int i = 0; i < catalogList.size(); i++) {
//      JournalCatalog catalog = catalogList.get(i);
//      Map<String, Object> resultMap = new CnkiArticleMetaByJournalSipder().spiderMetaArticles(entityManager,journalPath,journalURL,catalog);
//      if(resultMap == null) return;
//
//      try {
//          // After crawling, check whether article data was already stored for this catalog;
//          // remark "1" means stored, so the catalog is skipped
//          JournalCatalog journalCatalog = (JournalCatalog) entityManager.createQuery("FROM "+JournalCatalog.class.getName()+" where id=:id").setParameter("id", catalog.getId()).getSingleResult();
//          if("1".equals(journalCatalog.getRemark())) continue;
//
//          entityManager.getTransaction().begin();
//          List<Article> articles = (List<Article>) resultMap.get("articles");
//          for(Article article:articles){
//              entityManager.merge(article);
//          }
//          entityManager.getTransaction().commit();
//          entityManager.getTransaction().begin();
//          List<ArticleBusiness> articleBusinesss = (List<ArticleBusiness>) resultMap.get("articleBusinesss");
//          for(ArticleBusiness articleBusiness:articleBusinesss){
//              entityManager.merge(articleBusiness);
//          }
//          entityManager.getTransaction().commit();
//
//          entityManager.getTransaction().begin();
//          catalog.setRemark("1");
//          entityManager.merge(catalog);
//          entityManager.getTransaction().commit();
//      } catch (Exception e) {
//          entityManager.getTransaction().rollback();
//          System.out.println(catalog.getYear()+"_"+catalog.getIssue()+"抓取失败!!!");
//      }
//  }
    List<Article> articles;
    try {
        String sql = "SELECT o FROM "+ Article.class.getName()+ " o WHERE abstractinfo not like '%<%' and abstractinfo not like '%>%' and article_year = 2016 and article_remark1 is null";
        articles = entityManager.createQuery(sql).getResultList();
    } catch (Exception e) {
        // Report the failure instead of silently doing nothing.
        e.printStackTrace();
        return;
    }

    // NOTE(review): the same EntityManager is handed to four concurrent tasks; JPA does not
    // guarantee that an EntityManager is safe for concurrent use - confirm updateAbstrat copes with this.
    ExecutorService executorService = Executors.newFixedThreadPool(4);
    try {
        for (Article article : articles) {
            executorService.execute(() -> {
                new CnkiArticleMetaByJournalSipder().updateAbstrat(entityManager, article);
                try {
                    // Pause between requests handled by this worker.
                    Thread.sleep(1000);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            });
        }
    } finally {
        // Queued tasks still run; without this the non-daemon worker threads would never terminate.
        executorService.shutdown();
    }
}
/**
 * Crawls the CNKI HTML pages of every catalog (issue) in {@code catalogList}.
 *
 * <p>One {@code spiderArticleHTML} task per catalog is queued on a single worker thread, so the
 * catalogs are processed one after another in list order. The method returns as soon as the tasks
 * are queued; the worker is shut down so its thread ends once the queue drains.
 *
 * @param entityManager entity manager handed to every crawl task
 * @param journalPath   local working directory of the journal
 * @param journalURL    URL of the journal page on CNKI
 * @param catalogList   catalogs to crawl
 */
public void handleHtmlArticle(EntityManager entityManager,String journalPath,String journalURL,List<JournalCatalog> catalogList) {
    ExecutorService executorService = Executors.newFixedThreadPool(1);
    try {
        for (JournalCatalog catalog : catalogList) {
            executorService.execute(() ->
                    new CnkiArticleMetaByJournalSipder().spiderArticleHTML(entityManager, journalPath, journalURL, catalog));
        }
    } finally {
        // Queued tasks still run; without this the non-daemon worker thread would never terminate.
        executorService.shutdown();
    }
}
/**
 * Placeholder for the CNKI PDF handling step. The body is empty, so calling it has no effect.
 *
 * @param entityManager unused
 * @param journalPath   unused
 */
public void handleCnkiPDF(EntityManager entityManager,String journalPath) {
}
/**
 * Runs the shell commands produced by {@code sqlPattern} for every catalog with {@code year < 2018}
 * that has not been PDF-processed yet, then appends {@code "-PDF"} to the catalog's remark.
 *
 * <p>For each catalog the folder {@code <filePath>/<year>/<issue>/PDF} is created if missing, the
 * native query {@code sqlPattern + " where article_year=... and article_issue=..."} is executed, and
 * every returned string is started through {@code cmd.exe /c}. The commands are started without
 * waiting for them to finish.
 *
 * @param entityManager entity manager used for the queries and for saving the remark
 * @param filePath      root directory of the PDF files
 * @param sqlPattern    native SQL {@code select} that yields one shell command per article; a
 *                      {@code where} clause for year and issue is appended to it
 */
@SuppressWarnings("unchecked")
public void handleCustomPDF(EntityManager entityManager,String filePath,String sqlPattern) {
    List<JournalCatalog> catalogList = entityManager.createQuery("FROM "+JournalCatalog.class.getName()+" where year<2018 ORDER BY year DESC,issue DESC").getResultList();
    if (catalogList == null) {
        return;
    }
    Runtime runtime = Runtime.getRuntime();
    for (JournalCatalog catalog : catalogList) {
        // A catalog that was never marked has a null remark; treat it as "not processed yet".
        String remark = catalog.getRemark();
        if (remark != null && remark.contains("PDF")) {
            continue;
        }
        System.out.print("开始处理"+catalog.getYear()+"_"+catalog.getIssue()+" ");

        String issuePDFPath = filePath+File.separator+catalog.getYear()+File.separator+catalog.getIssue()+File.separator+"PDF";
        File issueFolder = new File(issuePDFPath);
        if (!issueFolder.exists()) {
            issueFolder.mkdirs();
        }

        //select concat(right(article_year,2),'-',article_issue,'-',article_fpage,'.pdf') from meta_article;
        String sql = sqlPattern+" where article_year='"+catalog.getYear()+"' and article_issue='"+catalog.getIssue()+"';";
        List<String> commands = entityManager.createNativeQuery(sql).getResultList();
        // NOTE(review): the command text comes straight from database values and is run through
        // cmd.exe; confirm those columns can never contain shell metacharacters.
        for (String cmd : commands) {
            try {
                runtime.exec("cmd.exe /c "+cmd);
            } catch (Exception e) {
                // Print the command that could not be started, plus the reason.
                System.out.println(cmd);
                e.printStackTrace();
            }
        }

        try {
            // Avoid storing the literal text "null-PDF" when there was no remark before.
            catalog.setRemark(remark == null ? "-PDF" : remark+"-PDF");
            entityManager.getTransaction().begin();
            entityManager.persist(catalog);
            entityManager.getTransaction().commit();
            System.out.print("处理完成!!!");
        } catch (Exception e) {
            e.printStackTrace();
            // rollback() throws if the transaction never became active.
            if (entityManager.getTransaction().isActive()) {
                entityManager.getTransaction().rollback();
            }
        }
        System.out.println("");
    }
}
}
抓取实现
package com.rhhz.datatransfer.cnki;
import java.io.File;
import java.io.FileFilter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Scanner;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.persistence.EntityManager;
import javax.persistence.Query;
import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.openqa.selenium.By;
import org.openqa.selenium.OutputType;
import org.openqa.selenium.TakesScreenshot;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeDriverService;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.edge.EdgeDriver;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;
import com.rhhz.core.element.metadata.bean.Article;
import com.rhhz.core.element.metadata.bean.ArticleAffiliation;
import com.rhhz.core.element.metadata.bean.ArticleAuthor;
import com.rhhz.core.element.metadata.bean.ArticleBusiness;
import com.rhhz.core.element.metadata.bean.ArticleFundPrj;
import com.rhhz.core.element.metadata.bean.ArticleKeyword;
import com.rhhz.core.element.metadata.bean.ArticleReference;
import com.rhhz.core.element.metadata.bean.Journal;
import com.rhhz.core.element.metadata.bean.JournalCatalog;
public class CnkiArticleMetaByJournalSipder{
public static String userAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36";

/**
 * Ad-hoc entry point for running a single crawl step by hand; currently it starts a PDF download
 * with fixed arguments.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    //getJournalURL();
    CnkiArticleMetaByJournalSipder spider = new CnkiArticleMetaByJournalSipder();
    String journalUrl = "https://navi.cnki.net/knavi/JournalDetail?pcode=CJFD&pykm=DWXY";
    spider.downloadPDF(journalUrl, "ZR", "2002", "1");
}
/**
 * Starts {@code cmd} as an external process and then terminates the running JVM with status 0.
 * If starting the process (or the exit call) throws, the stack trace is printed and the method returns.
 *
 * @param cmd command line to execute
 */
public void runtime(String cmd) {
    final Runtime jvm = Runtime.getRuntime();
    try {
        jvm.exec(cmd);
        // Reached only when the process was started successfully.
        jvm.exit(0);
    } catch (Exception failure) {
        failure.printStackTrace();
    }
}
/**
 * [1] Crawls a CNKI journal page and saves the article links of each issue under {@code filePath}.
 *
 * <p>For every issue visited, the links are written to {@code <filePath>/<issue element id>.txt}
 * (one URL per line) and a {@code year_issue:count} line is appended to {@code <filePath>/log.txt}.
 * When the browser is redirected to CNKI's validation page, the driver is quit and this method calls
 * itself again, passing the current year/issue/page as the new skip values.
 *
 * @param journalURL required; URL of the journal page to open
 * @param filePath required; output directory (created if missing)
 * @param skipYear auxiliary; years greater than this value are skipped
 * @param skipIssue auxiliary; within {@code skipYear}, issues greater than this value are skipped
 * @param pageIndex auxiliary; when non-empty, pager entries numerically smaller than this value are skipped
 * @param count auxiliary; running counter of validation redirects and failures, only used in console output
 */
public void spiderArticleLinks(String journalURL,String filePath,int skipYear,int skipIssue,String pageIndex,int count) {
File file = new File(filePath);
File log = new File(filePath+File.separator+"log.txt");
// out writes the current issue's link file; out2 appends to log.txt
FileOutputStream out = null;
FileOutputStream out2 = null;
try {
if(!file.exists()) file.mkdirs();
if(!log.exists()) log.createNewFile();
} catch (Exception e) {
System.out.println("文件创建失败!!!");
return;
}
// NOTE(review): errorIssue is never used, and nothing is ever added to articleLinksByIssue,
// so the print loop at the end of this method outputs nothing.
List<String> errorIssue = new ArrayList<String>();
List<String> articleLinksByIssue = new ArrayList<String>();
try {
WebDriver driver = null;
ChromeDriverService service = null;
String chromeDriverPath = "D:\\webDriver\\chromedriver.exe";
HashMap<String, Object> chromePrefs = new HashMap<String, Object>();
ChromeOptions chromeOptions = new ChromeOptions();
try {
chromePrefs.put("download.default_directory", "D:\\webDriver");
File chromeDriverFile = new File(chromeDriverPath);
System.setProperty("webdriver.chrome.driver",chromeDriverPath);
chromeOptions.setExperimentalOption("prefs", chromePrefs);
// Headless mode (noted as required by the original author; the flag is currently commented out)
//chromeOptions.addArguments("--headless");
chromeOptions.addArguments("--disable-gpu");
chromeOptions.addArguments("--no-sandbox");
// Disable popup blocking
chromeOptions.addArguments("--disable-popup-blocking");
// Skip the default-browser check
chromeOptions.addArguments("no-default-browser-check");
chromeOptions.addArguments("about:histograms");
chromeOptions.addArguments("about:cache");
chromeOptions.addArguments("--start-maximized");
// Build and start a ChromeDriverService
// NOTE(review): the service is started here but is not passed to the ChromeDriver constructor
// below and is never stopped in this method.
service = new ChromeDriverService.Builder().usingDriverExecutable(chromeDriverFile).usingAnyFreePort().build();
service.start();
} catch (IOException e1) {
e1.printStackTrace();
}
driver = new ChromeDriver(chromeOptions);
driver.get(journalURL);
try {
Thread.sleep(3000);
} catch (InterruptedException e) {
e.printStackTrace();
}
// Look for pager links; if none appear within the wait, pageEles stays null and the
// single-page branch (else) below is used.
List<WebElement> pageEles = null;
try {
pageEles = new WebDriverWait(driver, 10).until(ExpectedConditions.presenceOfAllElementsLocatedBy(By.cssSelector(".page-list a")));
} catch (Exception e) {
}
if(pageEles != null && !pageEles.isEmpty()) {
// Collect the labels of the pager links that contain a number
Pattern pattern = Pattern.compile("[0-9]+");
List<String> pageList = new ArrayList<String>();
for (int i = 0; i < pageEles.size(); i++) {
String pageStr = pageEles.get(i).getText().trim();
if("".equals(pageStr)) pageStr = pageEles.get(i).getAttribute("innerHTML").trim();
Matcher matcher = pattern.matcher(pageStr);
if(matcher.find()) {
pageList.add(pageStr);
}
}
for (int i = 0; i < pageList.size(); i++) {
// Re-locate the pager links on every pass before clicking one of them
try {
pageEles = new WebDriverWait(driver, 10).until(ExpectedConditions.presenceOfAllElementsLocatedBy(By.cssSelector(".page-list a")));
} catch (Exception e) {
}
String pageStr = pageList.get(i);
// When restarting after a validation redirect, skip the pages before pageIndex
if(!"".equals(pageIndex) && Integer.valueOf(pageStr) < Integer.valueOf(pageIndex)) continue;
for (WebElement pageEle : pageEles) {
String page = pageEle.getText().trim();
if("".equals(page)) page = pageEle.getAttribute("innerHTML").trim();
if(pageList.get(i).equals(page)) {
pageEle.click();
break;
}
}
// Redirected to the validation page: quit the browser and restart recursively from this page
if(driver.getCurrentUrl().contains("https://navi.cnki.net/knavi/Home/Validate?returnUrl=")) {
count++;
System.out.println("跳转验证界面退出,重新加载! "+count);
driver.manage().deleteAllCookies();
driver.quit();
spiderArticleLinks(journalURL,filePath,9999,99,pageStr,count);
return;
}
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
e.printStackTrace();
}
boolean flag = false;
// The year/issue container is looked up by an id built from the page position i
WebElement yearIssueEle = driver.findElement(By.id("yearissue+"+i));
List<WebElement> yearEles = yearIssueEle.findElements(By.tagName("dl"));
for (int j = 0; j < yearEles.size(); j++) {
WebElement yearEle = yearEles.get(j).findElement(By.tagName("dt"));
String year = yearEle.getAttribute("innerText").trim();
if(Integer.valueOf(year) > skipYear) continue;
yearEle.click(); // year
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
e.printStackTrace();
}
driver.manage().timeouts().implicitlyWait(5, TimeUnit.SECONDS);
WebElement issEle = yearEles.get(j).findElement(By.tagName("dd"));
// String isShow = issEle.getCssValue("display");
// if("none".equals(isShow)) continue;
List<WebElement> issueEles = issEle.findElements(By.tagName("a"));
for (int k = 0; k < issueEles.size(); k++) {
try {
WebElement issueEle = issueEles.get(k);
String issue = issueEle.getAttribute("innerHTML").replace("No.", "").trim();
if(Integer.valueOf(year) == skipYear && Integer.valueOf(issue) > skipIssue) continue;
String issueId = issueEle.getAttribute("id");
File issueFile = new File(filePath+File.separator+issueId+".txt");
// An existing non-empty link file means the issue was already crawled; skip it without clicking
if(issueFile.exists() && issueFile.length() > 0) continue;
if(!issueFile.exists()) issueFile.createNewFile();
issueEle.click(); // issue
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
e.printStackTrace();
}
// Redirected to the validation page: quit the browser and restart recursively from this year/issue/page
if(driver.getCurrentUrl().contains("https://navi.cnki.net/knavi/Home/Validate?returnUrl=")) {
count++;
System.out.println("跳转验证界面退出,重新加载! "+count);
driver.manage().deleteAllCookies();
driver.quit();
spiderArticleLinks(journalURL,filePath,Integer.valueOf(year),Integer.valueOf(issue),pageStr,count);
return;
}
out = new FileOutputStream(issueFile);
WebElement catalogContentEle = driver.findElement(By.id("CataLogContent"));
List<WebElement> articleByCatalogEles = catalogContentEle.findElements(By.cssSelector("dd .name a"));
for (WebElement articleEle : articleByCatalogEles) {
// Keep only the query string of the link and re-attach it to the detail-page URL
String href = articleEle.getAttribute("href");
String parameter = href.substring(href.indexOf("?")); //fileName
String articleUrl = "https://kns.cnki.net/kcms/detail/detail.aspx"+parameter+"\r\n";
out.write(articleUrl.getBytes());
//String artileTitle = articleEle.getText();
//String articleHTMLUrl = "https://kns.cnki.net/KXReader/Detail"+parameter;
//articleLinkList.add(articleUrl);
}
// Append "year_issue:linkCount" to log.txt
out2 = new FileOutputStream(log,true);
String logStr = year+"_"+issue+":"+articleByCatalogEles.size()+"\r\n";
out2.write(logStr.getBytes());
} catch (Exception e) {
e.printStackTrace();
count++;
System.out.println("文章链接获取失败:【"+count+"】");
}finally {
// NOTE(review): this also runs for the "continue" paths above, where out/out2 may still be
// null or already closed; any resulting exception is swallowed by the empty catch.
try {
out.close();
out2.close();
} catch (Exception e2) {
}
}
}
}
}
try {
driver.close();
} catch (Exception e) {
}
}else {
// No pager found: all years are expected under the single container "yearissue+0"
// NOTE(review): unlike the paged branch, this branch never closes the driver.
WebElement yearIssueEle = driver.findElement(By.id("yearissue+0"));
List<WebElement> yearEles = yearIssueEle.findElements(By.tagName("dl"));
for (int j = 0; j < yearEles.size(); j++) {
WebElement yearEle = yearEles.get(j).findElement(By.tagName("dt"));
String year = yearEle.getAttribute("innerText").trim();
if(Integer.valueOf(year) > skipYear) continue;
yearEle.click(); // year
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
e.printStackTrace();
}
WebElement issEle = yearEles.get(j).findElement(By.tagName("dd"));
// String isShow = issEle.getCssValue("display");
// if("none".equals(isShow)) continue;
List<WebElement> issueEles = issEle.findElements(By.tagName("a"));
for (int k = 0; k < issueEles.size(); k++) {
try {
WebElement issueEle = issueEles.get(k);
String issue = issueEle.getAttribute("innerHTML").replace("No.", "").trim();
if(Integer.valueOf(year) == skipYear && Integer.valueOf(issue) > skipIssue) continue;
String issueId = issueEle.getAttribute("id");
issueEle.click(); // issue
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
e.printStackTrace();
}
// Redirected to the validation page: quit the browser and restart recursively from this year/issue
if(driver.getCurrentUrl().contains("https://navi.cnki.net/knavi/Home/Validate?returnUrl=")) {
count++;
System.out.println("跳转验证界面退出,重新加载! "+count);
driver.manage().deleteAllCookies();
driver.quit();
spiderArticleLinks(journalURL,filePath,Integer.valueOf(year),Integer.valueOf(issue),"",count);
return;
}
File issueFile = new File(filePath+File.separator+issueId+".txt");
// Here the "already crawled" check happens after the issue was clicked
if(issueFile.exists() && issueFile.length() > 0) continue;
if(!issueFile.exists()) issueFile.createNewFile();
out = new FileOutputStream(issueFile);
WebElement catalogContentEle = driver.findElement(By.id("CataLogContent"));
List<WebElement> articleByCatalogEles = catalogContentEle.findElements(By.cssSelector("dd .name a"));
for (WebElement articleEle : articleByCatalogEles) {
// Keep only the query string of the link and re-attach it to the detail-page URL
String href = articleEle.getAttribute("href");
String parameter = href.substring(href.indexOf("?")); //fileName
String articleUrl = "https://kns.cnki.net/kcms/detail/detail.aspx"+parameter+"\r\n";
out.write(articleUrl.getBytes());
//String artileTitle = articleEle.getText();
//String articleHTMLUrl = "https://kns.cnki.net/KXReader/Detail"+parameter;
//articleLinkList.add(articleUrl);
}
// Append "year_issue:linkCount" to log.txt
out2 = new FileOutputStream(log,true);
String logStr = year+"_"+issue+":"+articleByCatalogEles.size()+"\r\n";
out2.write(logStr.getBytes());
} catch (Exception e) {
e.printStackTrace();
count++;
System.out.println("文章链接获取失败:【"+count+"】");
}finally {
// NOTE(review): same best-effort close as in the paged branch; failures are swallowed.
try {
out.close();
out2.close();
} catch (Exception e2) {
}
}
}
}
}
} catch (Exception e) {
e.printStackTrace();
}
for (String string : articleLinksByIssue) {
System.out.println(string);
}
}
/**
 * [2] Crawls the CNKI metadata page of every article of one issue.
 *
 * <p>Article links are read from {@code <journalPath>/<catalog id>.txt} when that file exists;
 * otherwise they are obtained through {@code parseArticleHTMLByYearIssue}. Each link is loaded with
 * Selenium first; if that throws, the page is fetched with HttpClient and parsed with Jsoup instead.
 * After the last link, the issue's table of contents is opened to copy the column (category) names
 * onto the collected articles. Finally chromedriver.exe and chrome.exe are force-killed via taskkill.
 *
 * @param entityManager used to load the single {@link Journal} row
 * @param journalPath directory holding the per-issue link files
 * @param journalURL journal page URL, opened only when the link file is missing
 * @param catalog the issue to crawl
 * @return a map with the keys {@code "articles"} (list of {@link Article}) and
 *         {@code "articleBusinesss"} (list of {@link ArticleBusiness})
 */
public Map<String,Object> spiderMetaArticles(EntityManager entityManager,String journalPath,String journalURL,JournalCatalog catalog) {
// Publisher ids already assigned in this issue, used to detect duplicates
Set<String> publisherIdSet = new HashSet<String>();
// Load the journal row (the catalog is passed in by the caller instead of being queried)
//JournalCatalog catalog = (JournalCatalog) entityManager.createQuery("FROM "+JournalCatalog.class.getName()+" where year="+year+" and issue="+issue ).getSingleResult();
Journal journal = (Journal) entityManager.createQuery("FROM "+Journal.class.getName()).getSingleResult();
System.out.println("============"+catalog.getYear()+"_"+catalog.getIssue()+"============");
String language = "cn";
Map<String,Object> dataMap = new HashMap<String,Object>();
List<Article> articleList = new ArrayList<Article>();
List<ArticleBusiness> businessList = new ArrayList<ArticleBusiness>();
WebDriver driver = null;
ChromeDriverService service = null;
String chromeDriverPath = "D:\\webDriver\\chromedriver.exe";
HashMap<String, Object> chromePrefs = new HashMap<String, Object>();
ChromeOptions chromeOptions = new ChromeOptions();
try {
chromePrefs.put("download.default_directory", "D:\\webDriver");
File chromeDriverFile = new File(chromeDriverPath);
System.setProperty("webdriver.chrome.driver",chromeDriverPath);
chromeOptions.setExperimentalOption("prefs", chromePrefs);
// Headless mode (noted as required by the original author)
chromeOptions.addArguments("--headless");
chromeOptions.addArguments("--disable-gpu");
chromeOptions.addArguments("--no-sandbox");
// Disable popup blocking
chromeOptions.addArguments("--disable-popup-blocking");
// Skip the default-browser check
chromeOptions.addArguments("no-default-browser-check");
chromeOptions.addArguments("about:histograms");
chromeOptions.addArguments("about:cache");
chromeOptions.addArguments("--start-maximized");
// Build and start a ChromeDriverService
// NOTE(review): the service is started here but is not passed to any ChromeDriver constructor
// in this method and is never stopped explicitly.
service = new ChromeDriverService.Builder().usingDriverExecutable(chromeDriverFile).usingAnyFreePort().build();
service.start();
} catch (IOException e1) {
e1.printStackTrace();
}
// Article links of this issue: from the saved link file if present, otherwise from the live page
List<String> articleLinkByYearIssue = new ArrayList<String>();
File issueFile = new File(journalPath+File.separator+catalog.getId()+".txt");
if(issueFile.exists()) {
try {
articleLinkByYearIssue = FileUtils.readLines(issueFile);
} catch (Exception e) {
e.printStackTrace();
}
}else{
driver = new ChromeDriver(chromeOptions);
driver.get(journalURL);
try {
Thread.sleep(3000);
} catch (InterruptedException e) {
e.printStackTrace();
}
articleLinkByYearIssue = parseArticleHTMLByYearIssue(driver,catalog);
}
// NOTE(review): when the else branch above ran, its driver is replaced here without being quit.
driver = new ChromeDriver(chromeOptions);
// Collected articles keyed by publisher id
Map<String, Article> articleMap = new HashMap<String, Article>();
for (int k = 0; k < articleLinkByYearIssue.size(); k++) {
// For testing: crawl only the last link
//if(k != articleLinkByYearIssue.size()-1) continue;
try {
// Presumably a quit driver prints a null session id in toString(); in that case a new
// browser is started - TODO confirm against the Selenium version in use
if(driver.toString().contains("null")) driver = new ChromeDriver(chromeOptions);
driver.get(articleLinkByYearIssue.get(k));
String articleTitle = driver.getTitle();
String articleHandle1 = driver.getWindowHandle();
Article metaArticle = sipderArticleMeta(driver, journal, catalog);
String publisherId = metaArticle.getPublisherId();
if(publisherIdSet.contains(publisherId)) {
// The publisher id is already taken (original note: articles with a duplicate page number):
// append a sequence suffix
while (true) {
/*************************** duplicate publisher id ***************************/
// First add "_1"; while the id is still taken, increase the number after "_"
if(!publisherId.contains("_")) publisherId = publisherId+"_1";
if(publisherIdSet.contains(publisherId)) {
int count = Integer.parseInt(publisherId.split("_")[1]);
publisherId = publisherId.replace("_"+count, "_"+(count+1));
}else {
metaArticle.setPublisherId(publisherId);
break;
}
}
}
publisherIdSet.add(metaArticle.getPublisherId());
articleMap.put(publisherId, metaArticle);
System.out.println("======SELENIUM="+articleMap.size());
List<WebElement> totalEles = driver.findElements(By.cssSelector(".total-inform span"));
String downCount = totalEles.get(0).getText().replace("下载:", "");
// Business record for the article
ArticleBusiness articleBusiness = new ArticleBusiness();
articleBusiness.setArticleId(metaArticle.getId());
articleBusiness.setPdfFileName(metaArticle.getId()+".pdf");
articleBusiness.setArticleState("-1");
articleBusiness.setArticleType("1");
articleBusiness.setPdfDownCount(Integer.valueOf(downCount));
articleBusiness.setViewCount(0);
businessList.add(articleBusiness);
} catch (Exception e) {
// Selenium path failed: quit the browser (if it still looks alive), then retry this link
// with HttpClient + Jsoup
try {
if(!driver.toString().contains("null")) {
//System.out.println(driver.toString());
driver.quit();
System.out.println(driver.toString());
}
} catch (Exception e2) {
}
// Page timed out: fall back to fetching the page over HTTP and parsing it with Jsoup
CloseableHttpClient client = HttpClients.createDefault();
try {
//http://journal02.magtech.org.cn/Jwk_xddl/CN/volumn/volumn_59.shtml
HttpGet get = new HttpGet(articleLinkByYearIssue.get(k));
get.setHeader("User-Agent",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0");
// NOTE(review): the response is not closed explicitly; only the client is closed in finally.
CloseableHttpResponse response = client.execute(get);
HttpEntity entity = response.getEntity();
String pageContent = EntityUtils.toString(entity, "UTF-8");
Document pageDoc = Jsoup.parse(pageContent);
Article article = getArticleMetaJsoup(pageDoc, journal, catalog, articleLinkByYearIssue.get(k));
String publisherId = article.getPublisherId();
if(publisherIdSet.contains(publisherId)) {
// The publisher id is already taken: append a sequence suffix (same scheme as the Selenium path)
while (true) {
/*************************** duplicate publisher id ***************************/
if(!publisherId.contains("_")) publisherId = publisherId+"_1";
if(publisherIdSet.contains(publisherId)) {
int count = Integer.parseInt(publisherId.split("_")[1]);
publisherId = publisherId.replace("_"+count, "_"+(count+1));
}else {
article.setPublisherId(publisherId);
break;
}
}
}
publisherIdSet.add(article.getPublisherId());
articleMap.put(publisherId, article);
System.out.println("=========JSOUP="+articleMap.size());
// Download count: taken from the span whose text contains "下载"; stays empty if none is found
String downCount = "";
Elements totalEles = pageDoc.select(".total-inform span");
for (Element totalEle : totalEles) {
if(totalEle.text().contains("下载"))
downCount = totalEle.html().replace("下载:", "");
}
// Business record for the article
ArticleBusiness articleBusiness = new ArticleBusiness();
articleBusiness.setArticleId(article.getId());
articleBusiness.setPdfFileName(article.getId()+".pdf");
articleBusiness.setArticleState("-1");
articleBusiness.setArticleType("1");
if(!"".equals(downCount)) articleBusiness.setPdfDownCount(Integer.valueOf(downCount));
articleBusiness.setViewCount(0);
businessList.add(articleBusiness);
}catch(Exception e2) {
e2.printStackTrace();
}finally {
try {
if(client !=null) client.close();
} catch (IOException e2) {
e2.printStackTrace();
}
}
}
// After the last link: open the issue's table of contents and assign column names.
// articleList is only filled inside this branch, so an empty link list yields empty results.
if(k == articleLinkByYearIssue.size() - 1) {
System.out.println("=======================================================");
// for (Entry<String, Article> set : articleMap.entrySet()) {
// System.out.println(set.getKey()+"======="+set.getValue().getFpage());
// }
// Restart the browser on the last article page if the driver was quit by the fallback above
if(driver.toString().contains("null")) {
System.out.println(driver.toString());
driver = new ChromeDriver(chromeOptions);
driver.get(articleLinkByYearIssue.get(k));
}
String lastArticleHandle = driver.getWindowHandle();
WebElement artilceListEle = null;
List<WebElement> elements = null;
String issueURL = "";
try {
// Click the second breadcrumb link and switch to the most recently opened window
List<WebElement> crumbEles = driver.findElements(By.cssSelector(".top-tip a"));
WebElement yearIssueEle = crumbEles.get(1);
yearIssueEle.click();
List<String> windows = new ArrayList<String>(driver.getWindowHandles());
driver.switchTo().window(windows.get(windows.size()-1));
issueURL = driver.getCurrentUrl();
try {Thread.sleep(2000);} catch (InterruptedException e1) {e1.printStackTrace();}
artilceListEle = new WebDriverWait(driver, 10).until(ExpectedConditions.presenceOfElementLocated(By.id("CataLogContent")));
//System.out.println(artilceListEle.getText());
//List<WebElement> elements = artilceListEle.findElements(By.cssSelector("dt,dd"));
elements = new WebDriverWait(driver, 10).until(ExpectedConditions.visibilityOfNestedElementsLocatedBy(artilceListEle, By.cssSelector("dt,dd")));
// A dt element sets the current column name; each following dd is an article in that column
String categoryName = "";
for (int i = 0; i < elements.size(); i++) {
WebElement element = elements.get(i);
if("dt".equals(element.getTagName())) {
String lanmu = element.getAttribute("innerHTML");
if(!categoryName.equals(lanmu)) categoryName = lanmu;
}else {
WebElement titleEle = element.findElement(By.cssSelector(".name a"));
String link = titleEle.getAttribute("href").trim();
String publisherID = "";
// Take the id from the link's filename parameter, otherwise build it as year-issue-page
if(link.contains("filename")) { //filename=LCGD201412004&
publisherID = link.substring(link.indexOf("filename=")+9);
if(publisherID.contains("&")) publisherID = publisherID.substring(0, publisherID.indexOf("&"));
}else {
WebElement pageEle = element.findElement(By.cssSelector(".company"));
String page = pageEle.getAttribute("innerHTML").trim();
publisherID = catalog.getYear()+"-"+catalog.getIssue()+"-"+page;
}
// Look up the Article by this id and store the column name on it
if(articleMap.containsKey(publisherID)) {
Article newArticle = articleMap.get(publisherID);
newArticle.setCategoryName(categoryName);
newArticle.setCategoryNameCn(categoryName);
articleMap.put(publisherID, newArticle);
}
}
}
} catch (Exception e) {
// Selenium lookup failed: if the issue page URL was reached, parse the current page source
// with Jsoup instead
if(!"".equals(issueURL)) {
try {
Document document = Jsoup.parse(driver.getPageSource());
Element catalogContent = document.getElementById("CataLogContent");
Elements dtddEles = catalogContent.select("dt,dd");
if(!dtddEles.isEmpty()) {
String categoryName = "";
for (Element element : dtddEles) {
if("dt".equals(element.tagName())) {
String lanmu = element.html();
if(!categoryName.equals(lanmu)) categoryName = lanmu;
}else {
Element titleEle = element.select(".name a").get(0);
String link = titleEle.attr("href").trim();
String publisherID = "";
if(link.contains("filename")) { //filename=LCGD201412004&
publisherID = link.substring(link.indexOf("filename=")+9);
if(publisherID.contains("&")) publisherID = publisherID.substring(0, publisherID.indexOf("&"));
}else {
Element pageEle = element.getElementsByClass("company").get(0);
String page = pageEle.text().trim();
publisherID = catalog.getYear()+"-"+catalog.getIssue()+"-"+page;
}
// Look up the Article by this id and store the column name on it
if(articleMap.containsKey(publisherID)) {
Article newArticle = articleMap.get(publisherID);
newArticle.setCategoryName(categoryName);
newArticle.setCategoryNameCn(categoryName);
articleMap.put(publisherID, newArticle);
}
}
//System.out.println("####################################################################################");
//System.out.println(element.toString());
}
}
} catch (Exception e2) {
System.out.println(catalog.getYear()+"_"+catalog.getIssue()+":栏目获取失败!!!");
}
}
}
// Copy the collected articles into the result list
for (Entry<String, Article> set : articleMap.entrySet()) {
articleList.add(set.getValue());
}
}
}
try {
// Free memory: force-kill every chromedriver.exe and chrome.exe process on the machine
Runtime runtime = Runtime.getRuntime();
runtime.exec("taskkill /f /im chromedriver.exe /t");
runtime.exec("taskkill /f /im chrome.exe /t");
} catch (Exception e) {
e.printStackTrace();
}
dataMap.put("articles", articleList);
dataMap.put("articleBusinesss", businessList);
return dataMap;
}
/**
 * Builds an {@link Article} from an already-parsed CNKI article detail page.
 * <p>
 * Reads the title, authors (with their affiliation markers), affiliations, abstract,
 * keywords, funding, classification number, DOI and page range from {@code pageDoc}, then
 * stamps the result with the journal and catalog (issue) it belongs to.
 *
 * @param pageDoc     parsed detail page
 * @param journal     journal the article belongs to; supplies the language
 * @param catalog     issue the article belongs to; supplies year, issue and catalog id
 * @param articleLink URL of the detail page; its {@code filename=} query value, when present,
 *                    becomes the article id
 * @return the populated article; this method does not persist it
 */
public Article getArticleMetaJsoup(Document pageDoc,Journal journal,JournalCatalog catalog,String articleLink) {
    Article article = new Article();
    List<ArticleAuthor> authorList = new ArrayList<ArticleAuthor>();
    List<ArticleAffiliation> affiliationList = new ArrayList<ArticleAffiliation>();
    List<ArticleKeyword> keywordList = new ArrayList<ArticleKeyword>();
    List<ArticleFundPrj> fundPrjList = new ArrayList<ArticleFundPrj>();
    Element titleEle = pageDoc.select(".wx-tit h1").get(0);
    String title = titleEle.text();
    // Authors
    Element authorsEle = pageDoc.getElementById("authorpart");
    Elements authorEles = authorsEle.select("span a");
    Pattern patternSup = Pattern.compile("<sup>(.*)</sup>");
    for (int i = 0; i < authorEles.size(); i++) {
        // The affiliation marker lives in a <sup> element, so it has to be looked up in the
        // markup: text() has no tags, which made the pattern below never match and left the
        // marker digits glued to the author name.
        String authorHtml = authorEles.get(i).html().trim();
        Matcher matcher = patternSup.matcher(authorHtml);
        String tagId = "";
        if (matcher.find()) tagId = matcher.group();
        // Drop the marker, then reduce whatever markup is left back to plain text.
        String authorStr = Jsoup.parse(authorHtml.replace(tagId, "")).text().trim();
        ArticleAuthor author = new ArticleAuthor();
        author.setAuthorName(authorStr);
        author.setAuthorNameCn(authorStr);
        author.setSortNumber(i+1);
        if (!"".equals(tagId)) {
            tagId = tagId.replace("<sup>", "").replace("</sup>", "");
            author.setAuthorTagVal(tagId);
            // "1,2" becomes "aff1,aff2"; a single marker "1" becomes "aff1".
            String tagValue = "";
            String[] tagValues = new String[]{};
            if (tagId.contains(",")) {
                tagValues = tagId.split(",");
            }
            if (tagValues.length > 1) {
                StringBuilder joined = new StringBuilder();
                for (int j = 0; j < tagValues.length; j++) {
                    if (j > 0) joined.append(",");
                    joined.append("aff").append(tagValues[j]);
                }
                tagValue = joined.toString();
            } else {
                tagValue = "aff"+tagId;
            }
            author.setAddressTagIds(tagValue);
        }
        authorList.add(author);
    }
    // Author addresses: the links inside the last <h3> of the title box
    Elements elements = pageDoc.select(".wx-tit h3");
    Elements addressEles = elements.get(elements.size()-1).getElementsByTag("a");
    for (int i = 0; i < addressEles.size(); i++) {
        String addressStr = addressEles.get(i).text();
        ArticleAffiliation affiliation = new ArticleAffiliation();
        affiliation.setSortNumber(i+1);
        affiliation.setAddress(addressStr);
        affiliation.setAddressCn(addressStr);
        affiliationList.add(affiliation);
    }
    // Abstract, keywords, funding, classification number
    Elements rowEles = pageDoc.select(".row");
    for (Element rowEle : rowEles) {
        String content = rowEle.text();
        if (content.contains("摘要")) {
            String abstractInfo = rowEle.getElementById("ChDivSummary").outerHtml()
                    .replace("<span id=\"ChDivSummary\" name=\"ChDivSummary\" class=\"abstract-text\">", "").replace("</span>", "");
            article.setAbstractinfo(abstractInfo);
            article.setAbstractinfoCn(abstractInfo);
        } else if (content.contains("关键词")) {
            String keywordStr = content.replace("关键词:", "");
            String[] keywords = keywordStr.split(";");
            for (int i = 0; i < keywords.length; i++) {
                // The row text carries whitespace around the separators; store clean keywords.
                String keywordText = keywords[i].trim();
                ArticleKeyword keyword = new ArticleKeyword();
                keyword.setSortNum(i+1);
                keyword.setKeyword(keywordText);
                keyword.setKeywordCn(keywordText);
                keywordList.add(keyword);
            }
        } else if (content.contains("基金")) {
            String fundPrj = content.replace("基金资助:", "");
            ArticleFundPrj articleFundPrj = new ArticleFundPrj();
            articleFundPrj.setSortNum(1);
            articleFundPrj.setFundsInfo(fundPrj);
            articleFundPrj.setFundsInfoCn(fundPrj);
            fundPrjList.add(articleFundPrj);
        } else if (content.contains("分类号")) {
            Elements topSpaceEles = rowEle.getElementsByClass("top-space");
            for (int i = 0; i < topSpaceEles.size(); i++) {
                String text = topSpaceEles.get(i).text();
                if (text.contains("分类号")) {
                    article.setClcNos(text.replace("分类号:", "").trim());
                } else if (text.contains("DOI")) {
                    article.setDoi(text.replace("DOI:", "").trim());
                }
            }
        }
    }
    // Page numbers, taken from the citation line
    Element citationEle = pageDoc.select(".top-tip").get(0);
    String citationStr = citationEle.text();
    String pageStr = "";
    Pattern pattern = Pattern.compile("[0-9]+-[0-9]+");
    Matcher matcher = pattern.matcher(citationStr);
    if (matcher.find()) { // "first-last" page range
        pageStr = matcher.group();
        String[] pages = pageStr.split("-");
        article.setFpage(pages[0]);
        article.setLpage(pages[1]);
    } else { // a single page number; stored as the first page
        pattern = Pattern.compile("第[0-9]+页");
        matcher = pattern.matcher(citationStr);
        if (matcher.find()) {
            pageStr = matcher.group().replace("第", "").replace("页", "").trim();
            article.setFpage(pageStr);
        }
    }
    // Article id: the "filename" query value of the link, else "year-issue-pages".
    String publisherId = "";
    if (articleLink.contains("filename")) {
        publisherId = articleLink.substring(articleLink.indexOf("filename=")).replace("filename=", "");
        if (publisherId.contains("&")) publisherId = publisherId.substring(0, publisherId.indexOf("&"));
    }
    if ("".equals(publisherId)) publisherId = catalog.getYear()+"-"+catalog.getIssue()+"-"+pageStr;
    article.setId(publisherId);
    article.setArticleNo(publisherId);
    article.setPublisherId(publisherId);
    article.setTitle(title);
    article.setTitleCn(title);
    article.setLanguage(journal.getLanguage());
    article.setYear(catalog.getYear());
    article.setIssue(catalog.getIssue());
    if (authorList.size() > 0) article.setAuthors(authorList);
    if (affiliationList.size() > 0) article.setAffiliations(affiliationList);
    if (keywordList.size() > 0) article.setKeywords(keywordList);
    if (fundPrjList.size() > 0) article.setFundPrjs(fundPrjList);
    article.setJournal(journal);
    article.setCatalogId(catalog.getId());
    article.setReleaseState(1);
    return article;
}
/**
 * Scrapes the metadata of the article detail page currently open in {@code driver}.
 * <p>
 * Collects the title, authors (with affiliation markers), affiliations, abstract, keywords,
 * funding, classification number, DOI and page range, and tags the result with the given
 * journal and catalog (issue).
 *
 * @param driver  browser session showing the article detail page
 * @param journal journal the article belongs to; supplies the language
 * @param catalog issue the article belongs to; supplies year, issue and catalog id
 * @return the populated article; this method does not persist it
 */
public Article sipderArticleMeta(WebDriver driver,Journal journal,JournalCatalog catalog) {
    Article result = new Article();

    // Article number: the "filename" query value of the current URL, when there is one.
    String pageUrl = driver.getCurrentUrl();
    String articleNo = "";
    if (pageUrl.contains("filename")) {
        articleNo = pageUrl.substring(pageUrl.indexOf("filename=")).replace("filename=", "");
        int ampAt = articleNo.indexOf("&");
        if (ampAt >= 0) {
            articleNo = articleNo.substring(0, ampAt);
        }
    }

    List<ArticleAuthor> authors = new ArrayList<ArticleAuthor>();
    List<ArticleAffiliation> addresses = new ArrayList<ArticleAffiliation>();
    List<ArticleKeyword> keywordItems = new ArrayList<ArticleKeyword>();
    List<ArticleFundPrj> funds = new ArrayList<ArticleFundPrj>();

    WebElement titleBox = driver.findElement(By.className("wx-tit"));
    String heading = titleBox.findElement(By.tagName("h1")).getAttribute("innerHTML");

    // Authors: one <span> per author, optionally wrapping the name in a link.
    List<WebElement> authorSpans = titleBox.findElement(By.id("authorpart")).findElements(By.cssSelector("span"));
    Pattern supPattern = Pattern.compile("<sup>(.*)</sup>");
    int authorOrder = 0;
    for (WebElement span : authorSpans) {
        authorOrder++;
        String spanHtml = span.getAttribute("innerHTML");
        String rawName = spanHtml.contains("<a")
                ? span.findElement(By.tagName("a")).getAttribute("innerHTML").trim()
                : spanHtml.trim();
        Matcher supMatcher = supPattern.matcher(rawName);
        String marker = supMatcher.find() ? supMatcher.group() : "";
        String cleanName = rawName.replace(marker, "").replace("<i class=\"icon-email\"></i>", "").trim();

        ArticleAuthor one = new ArticleAuthor();
        one.setAuthorName(cleanName);
        one.setAuthorNameCn(cleanName);
        one.setSortNumber(authorOrder);
        if (!marker.isEmpty()) {
            marker = marker.replace("<sup>", "").replace("</sup>", "");
            one.setAuthorTagVal(marker);
            // "1,2" becomes "aff1,aff2"; anything that does not split into several parts
            // is prefixed as a whole.
            String[] parts = marker.contains(",") ? marker.split(",") : new String[0];
            String affIds;
            if (parts.length > 1) {
                StringBuilder joined = new StringBuilder();
                for (String part : parts) {
                    joined.append("aff").append(part).append(",");
                }
                affIds = joined.substring(0, joined.length() - 1);
            } else {
                affIds = "aff" + marker;
            }
            one.setAddressTagIds(affIds);
        }
        authors.add(one);
    }

    // Author addresses: the links inside the last <h3> of the title box.
    List<WebElement> subHeadings = titleBox.findElements(By.tagName("h3"));
    List<WebElement> addressLinks = subHeadings.get(subHeadings.size() - 1).findElements(By.tagName("a"));
    int addressOrder = 0;
    for (WebElement addressLink : addressLinks) {
        addressOrder++;
        String addressHtml = addressLink.getAttribute("innerHTML");
        ArticleAffiliation place = new ArticleAffiliation();
        place.setSortNumber(addressOrder);
        place.setAddress(addressHtml);
        place.setAddressCn(addressHtml);
        addresses.add(place);
    }

    // Abstract, keywords, funding, classification number: one ".row" element each.
    for (WebElement row : driver.findElements(By.className("row"))) {
        String rowText = row.getText();
        if (rowText.contains("摘要")) {
            try {
                // Expand a collapsed abstract when the "more" toggle is there and visible.
                WebElement expand = row.findElement(By.id("ChDivSummaryMore"));
                if (expand.isDisplayed()) {
                    expand.click();
                }
            } catch (Exception e) {
                // No usable toggle on this page; the abstract is read as shown.
            }
            String summary = row.findElement(By.id("ChDivSummary")).getAttribute("innerHTML")
                    .replace("<span id=\"ChDivSummary\" name=\"ChDivSummary\" class=\"abstract-text\">", "")
                    .replace("</span>", "");
            result.setAbstractinfo(summary);
            result.setAbstractinfoCn(summary);
        } else if (rowText.contains("关键词")) {
            String[] words = rowText.replace("关键词:", "").split(";");
            for (int w = 0; w < words.length; w++) {
                ArticleKeyword item = new ArticleKeyword();
                item.setSortNum(w + 1);
                item.setKeyword(words[w]);
                item.setKeywordCn(words[w]);
                keywordItems.add(item);
            }
        } else if (rowText.contains("基金")) {
            String fundText = rowText.replace("基金资助:", "");
            ArticleFundPrj fund = new ArticleFundPrj();
            fund.setSortNum(1);
            fund.setFundsInfo(fundText);
            fund.setFundsInfoCn(fundText);
            funds.add(fund);
        } else if (rowText.contains("分类号")) {
            for (WebElement cell : row.findElements(By.cssSelector(".top-space"))) {
                String cellText = cell.getAttribute("innerText");
                if (cellText.contains("分类号")) {
                    result.setClcNos(cellText.replace("分类号:", "").trim());
                } else if (cellText.contains("DOI")) {
                    result.setDoi(cellText.replace("DOI:", "").trim());
                }
            }
        }
    }

    // Page numbers, taken from the citation line.
    String citation = driver.findElement(By.className("top-tip")).getText();
    String pageText = "";
    Matcher rangeMatcher = Pattern.compile("[0-9]+-[0-9]+").matcher(citation);
    if (rangeMatcher.find()) {
        // "first-last" page range
        pageText = rangeMatcher.group();
        String[] bounds = pageText.split("-");
        result.setFpage(bounds[0]);
        result.setLpage(bounds[1]);
    } else {
        // a single page number; stored as the first page
        Matcher singleMatcher = Pattern.compile("第[0-9]+页").matcher(citation);
        if (singleMatcher.find()) {
            pageText = singleMatcher.group().replace("第", "").replace("页", "").trim();
            result.setFpage(pageText);
        }
    }

    // Without a filename in the URL the id falls back to "year-issue-pages".
    if (articleNo.isEmpty()) {
        articleNo = catalog.getYear() + "-" + catalog.getIssue() + "-" + pageText;
    }
    result.setId(articleNo);
    result.setArticleNo(articleNo);
    result.setPublisherId(articleNo);
    result.setTitle(heading);
    result.setTitleCn(heading);
    result.setLanguage(journal.getLanguage());
    result.setYear(catalog.getYear());
    result.setIssue(catalog.getIssue());
    if (!authors.isEmpty()) {
        result.setAuthors(authors);
    }
    if (!addresses.isEmpty()) {
        result.setAffiliations(addresses);
    }
    if (!keywordItems.isEmpty()) {
        result.setKeywords(keywordItems);
    }
    if (!funds.isEmpty()) {
        result.setFundPrjs(funds);
    }
    result.setJournal(journal);
    result.setCatalogId(catalog.getId());
    result.setReleaseState(1);
    return result;
}
/**
 * Collects the detail-page URLs of the articles listed for one issue of a journal.
 * <p>
 * The driver is expected to show the journal's CNKI navigation page. The method walks the
 * ".page-list a" pager links. For a link whose label contains digits it looks for the year in
 * the matching "yearissue+N" list, clicks the pager link, then opens the year and the issue
 * and reads the article links from "CataLogContent". For a link without digits it reads from
 * "yearissue+0" directly.
 * <p>
 * The driver is quit as soon as one year/issue list has been processed. It is left open when
 * no pager link is found or when no numbered page lists the requested year.
 *
 * @param driver  browser session positioned on the journal page
 * @param catalog supplies the year and issue to look up
 * @return the "kcms/detail/detail.aspx" URLs found, possibly empty
 */
public static List<String> parseArticleHTMLByYearIssue(WebDriver driver,JournalCatalog catalog){
    List<String> articleLinkList = new ArrayList<String>();
    String year = catalog.getYear();
    // Padded once up front: the value does not change while the lists are scanned.
    String issue = padSingleDigitIssue(catalog.getIssue());
    WebDriverWait wait = new WebDriverWait(driver, 10);
    List<WebElement> pageEles = null;
    try {
        driver.manage().timeouts().pageLoadTimeout(3, TimeUnit.SECONDS);
        pageEles = wait.until(ExpectedConditions.presenceOfAllElementsLocatedBy(By.cssSelector(".page-list a")));
    } catch (Exception e) {
        // The wait failed or timed out: fall back to whatever is present right now.
        pageEles = driver.findElements(By.cssSelector(".page-list a"));
    }
    if (pageEles == null || pageEles.isEmpty()) {
        return articleLinkList;
    }
    Pattern pattern = Pattern.compile("[0-9]+");
    for (int i = 0; i < pageEles.size(); i++) {
        // Does this pager link carry a page number?
        String pageStr = pageEles.get(i).getText().trim();
        if ("".equals(pageStr)) pageStr = pageEles.get(i).getAttribute("innerHTML").trim();
        if (pattern.matcher(pageStr).find()) {
            WebElement yearIssueEle = driver.findElement(By.id("yearissue+"+i));
            List<WebElement> yearEles = yearIssueEle.findElements(By.tagName("dl"));
            boolean flag = false;
            for (int j = 0; j < yearEles.size(); j++) {
                WebElement yearEle = yearEles.get(j).findElement(By.tagName("dt"));
                if (year.equals(readYearLabel(yearEle))) { // this page lists the requested year
                    flag = true;
                    pageEles.get(i).click();
                    break;
                }
            }
            if (flag) { // now on the right page
                collectIssueArticleLinks(driver, yearEles, year, issue, articleLinkList, false);
                System.out.println("========"+year+"_"+issue+"========"+articleLinkList.size()+"篇");
                driver.quit();
                break;
            }
        } else {
            // Pager link without a number: read from the first year/issue list.
            WebElement yearIssueEle = driver.findElement(By.id("yearissue+0"));
            List<WebElement> yearEles = yearIssueEle.findElements(By.tagName("dl"));
            collectIssueArticleLinks(driver, yearEles, year, issue, articleLinkList, true);
            driver.quit();
            // The session is gone after quit(); continuing the loop would call into a dead
            // driver and throw, so stop here.
            break;
        }
    }
    return articleLinkList;
}

/** Left-pads an issue number below 10 that contains no "0" (e.g. "1" becomes "01"). */
private static String padSingleDigitIssue(String issue) {
    if (issue == null) {
        return null;
    }
    try {
        if (Integer.parseInt(issue) < 10 && !issue.contains("0")) {
            return "0"+issue;
        }
    } catch (NumberFormatException ignored) {
        // Non-numeric issue label: nothing to pad, it is compared as it is.
    }
    return issue;
}

/** Returns the trimmed visible text of a year header, or its innerText when that is empty. */
private static String readYearLabel(WebElement yearEle) {
    String yearText = yearEle.getText().trim();
    if ("".equals(yearText)) yearText = yearEle.getAttribute("innerText");
    return yearText;
}

/** Opens the given year and issue in a year/issue list and appends the article detail URLs found. */
private static void collectIssueArticleLinks(WebDriver driver, List<WebElement> yearEles, String year,
        String issue, List<String> articleLinkList, boolean printLinks) {
    for (int j = 0; j < yearEles.size(); j++) {
        WebElement yearEle = yearEles.get(j).findElement(By.tagName("dt"));
        if (!year.equals(readYearLabel(yearEle))) {
            continue;
        }
        yearEle.click();
        try {
            Thread.sleep(1000);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        WebElement issEle = yearEles.get(j).findElement(By.tagName("dd"));
        List<WebElement> issueEles = issEle.findElements(By.tagName("a"));
        for (int k = 0; k < issueEles.size(); k++) {
            WebElement issueEle = issueEles.get(k);
            String issueLabel = issueEle.getAttribute("innerHTML").replace("No.", "").trim();
            if (!issueLabel.equals(issue)) {
                continue;
            }
            issueEle.click();
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
            WebElement catalogContentEle = driver.findElement(By.id("CataLogContent"));
            List<WebElement> articleByCatalogEles = catalogContentEle.findElements(By.cssSelector("dd .name a"));
            for (WebElement articleEle : articleByCatalogEles) {
                String href = articleEle.getAttribute("href");
                int queryAt = href == null ? -1 : href.indexOf("?");
                if (queryAt < 0) {
                    // No query string means no filename parameter to build a detail URL from.
                    continue;
                }
                String parameter = href.substring(queryAt);
                String articleUrl = "https://kns.cnki.net/kcms/detail/detail.aspx"+parameter;
                if (printLinks) {
                    System.out.println(articleEle.getText());
                    System.out.println(articleUrl);
                    System.out.println("https://kns.cnki.net/KXReader/Detail"+parameter);
                }
                articleLinkList.add(articleUrl);
            }
        }
    }
}
/**
 * [3] Scrapes the CNKI full-text HTML page of every article of one catalog (issue).
 * <p>
 * For each stored article that is not yet marked "HTML" the detail page is fetched over HTTP;
 * when it offers the "HTML阅读" button, Chrome opens the full-text page and the English title,
 * abstract, author names, affiliations and keywords, the author biographies, the received date
 * and the reference list are merged into the article. Updated articles are persisted, and the
 * catalog remark gets "_HTML" appended once every article of the catalog was already marked
 * "HTML" when this run started.
 *
 * @param entityManager used to load the catalog's articles and to store the results
 * @param journalPath   not used by this method
 * @param journalURL    not used by this method
 * @param catalog       the issue to process; skipped when its remark already contains "HTML"
 */
@SuppressWarnings("unchecked")
public void spiderArticleHTML(EntityManager entityManager,String journalPath,String journalURL,JournalCatalog catalog) {
    // A missing remark means "nothing recorded yet", not an error.
    if (catalog.getRemark() != null && catalog.getRemark().contains("HTML")) return;
    System.out.println("&&&&&&&&&&&&&&&&&&&&&&&&&&& "+catalog.getYear()+"_"+catalog.getIssue()+":开始处理");
    List<Article> newArticleList = new ArrayList<Article>();
    // Bound parameter instead of concatenating the id into the JPQL text.
    List<Article> articleList = entityManager.createQuery("FROM "+Article.class.getName()+" WHERE catalogId=:catalogId")
            .setParameter("catalogId", catalog.getId()).getResultList();
    ChromeDriverService service = null;
    try {
        // Chrome driver configuration
        WebDriver driver = null;
        String chromeDriverPath = "D:\\webDriver\\chromedriver.exe";
        HashMap<String, Object> chromePrefs = new HashMap<String, Object>();
        ChromeOptions chromeOptions = new ChromeOptions();
        try {
            chromePrefs.put("download.default_directory", "D:\\webDriver");
            File chromeDriverFile = new File(chromeDriverPath);
            System.setProperty("webdriver.chrome.driver", chromeDriverPath);
            chromeOptions.setExperimentalOption("prefs", chromePrefs);
            // Headless mode is deliberately left off.
            // chromeOptions.addArguments("--headless");
            chromeOptions.addArguments("--disable-gpu");
            chromeOptions.addArguments("--no-sandbox");
            // Do not block pop-ups
            chromeOptions.addArguments("--disable-popup-blocking");
            // Skip the default-browser check
            chromeOptions.addArguments("no-default-browser-check");
            chromeOptions.addArguments("about:histograms");
            chromeOptions.addArguments("about:cache");
            chromeOptions.addArguments("--start-maximized");
            // Start a ChromeDriver service
            service = new ChromeDriverService.Builder().usingDriverExecutable(chromeDriverFile).usingAnyFreePort()
                    .build();
            service.start();
        } catch (IOException e1) {
            e1.printStackTrace();
        }
        driver = new ChromeDriver(chromeOptions);
        int htmlCount = 0;
        try {
            for (int i = 0; i < articleList.size(); i++) {
                try {
                    Article article = articleList.get(i);
                    if ("HTML".equals(article.getRemark2())) {
                        htmlCount = htmlCount + 1;
                        continue; // the CNKI HTML page of this article was scraped before
                    }
                    String link = "https://kns.cnki.net/kcms/detail/detail.aspx?sfield=FN&dbCode=CJFD&filename="+article.getId()+"&tableName=CJFD2000&url=";
                    HttpGet get = new HttpGet(link);
                    get.setHeader("User-Agent",
                            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0");
                    String pageContent;
                    // Client and response are closed per article instead of being leaked.
                    try (CloseableHttpClient client = HttpClients.createDefault();
                            CloseableHttpResponse response = client.execute(get)) {
                        HttpEntity entity = response.getEntity();
                        pageContent = EntityUtils.toString(entity, "UTF-8");
                    }
                    Document pageDoc = Jsoup.parse(pageContent);
                    Element btnEles = pageDoc.getElementById("DownLoadParts");
                    // No download area or no full-text button: nothing to scrape.
                    if (btnEles == null || !btnEles.text().contains("HTML阅读")) continue;
                    driver.manage().timeouts().implicitlyWait(15, TimeUnit.SECONDS);
                    driver.manage().timeouts().pageLoadTimeout(30, TimeUnit.SECONDS);
                    // Open the detail page, then follow its full-text button
                    driver.get(link);
                    try {
                        Thread.sleep(3000);
                    } catch (Exception e) {
                        // Pause cut short; carry on with the page as it is.
                    }
                    WebElement element = driver.findElement(By.id("DownLoadParts"));
                    WebElement btnHTMLEle = element.findElement(By.cssSelector(".btn-html a"));
                    btnHTMLEle.click();
                    try {
                        Thread.sleep(2000);
                    } catch (InterruptedException e) {
                        e.printStackTrace();
                    }
                    List<String> windows = new ArrayList<String>(driver.getWindowHandles());
                    driver.switchTo().window(windows.get(windows.size() - 1));
                    // Hold while the page is titled "安全验证" (security verification);
                    // presumably it is cleared by hand in the open browser - confirm.
                    // Poll once per second instead of spinning on the driver.
                    while ("安全验证".equals(driver.getTitle())) {
                        try {
                            Thread.sleep(1000);
                        } catch (InterruptedException e) {
                            Thread.currentThread().interrupt();
                            break;
                        }
                    }
                    Document document = Jsoup.parse(driver.getPageSource());
                    System.out.println(article.getTitle()+" 开始抓取HTML");
                    Element contentEle = document.getElementsByClass("content").get(0);
                    Elements titleEles = contentEle.getElementsByTag("h1");
                    if (titleEles.size() > 1) { // only continue when an English title is present
                        List<ArticleAuthor> authorList = new ArrayList<ArticleAuthor>();
                        List<ArticleAffiliation> affiliationList = new ArrayList<ArticleAffiliation>();
                        List<ArticleKeyword> keywordList = new ArrayList<ArticleKeyword>();
                        List<ArticleReference> referenceList = new ArrayList<ArticleReference>();
                        Element titleEnEle = titleEles.get(titleEles.size() - 1);
                        String titleEn = titleEnEle.text();
                        // Author biographies and received date from the ".brief" block
                        List<String> bioList = new ArrayList<String>();
                        Elements briefEles = contentEle.select(".brief");
                        if (briefEles.size() > 0) {
                            Elements pEles = briefEles.get(0).getElementsByTag("p");
                            for (int j = 0; j < pEles.size(); j++) {
                                String content = pEles.get(j).text();
                                if (content.contains("作者简介")) {
                                    Elements authroBioEles = pEles.get(j).getElementsByTag("span");
                                    for (int k = 0; k < authroBioEles.size(); k++) {
                                        bioList.add(authroBioEles.get(k).text());
                                    }
                                } else if (content.contains("收稿日期")) {
                                    String received = content.replace("收稿日期:", "");
                                    article.setReceivedDate(new SimpleDateFormat("yyyy-MM-dd").parse(received));
                                }
                            }
                        }
                        // English author names: the <h2> right after the English title
                        Element authorEnEle = titleEnEle.nextElementSibling();
                        if ("h2".equals(authorEnEle.tagName())) {
                            Elements authorEnEles = authorEnEle.getElementsByTag("span");
                            List<ArticleAuthor> authors = article.getAuthors();
                            if (authors.size() != authorEnEles.size()) {
                                // Counts differ: rebuild the list from the Chinese authors on the
                                // page, reusing stored authors whose Chinese name matches.
                                Element authorCnEle = contentEle.selectFirst(".top-title + h2");
                                Elements authorCnEles = authorCnEle.select("a");
                                System.out.println("中英文作者不一致:"+article.getId()+"["+authors.size()+"-CN:"+authorCnEles.size()+"-EN:"+authorEnEles.size()+"]");
                                for (int j = 0; j < authorCnEles.size(); j++) { // Chinese authors on the page
                                    String authorCn = authorCnEles.get(j).text();
                                    boolean flag = false; // whether a stored author has this name
                                    for (int k = 0; k < authors.size(); k++) { // stored authors
                                        ArticleAuthor author = authors.get(k);
                                        if (authorCn.equals(author.getAuthorNameCn())) {
                                            flag = true;
                                            if (j < authorEnEles.size()) {
                                                author.setAuthorNameEn(authorEnEles.get(j).text());
                                            }
                                            author.setSortNumber(j+1);
                                            applyAuthorBio(author, bioList);
                                            authorList.add(author);
                                        }
                                    }
                                    if (!flag) {
                                        ArticleAuthor author = new ArticleAuthor();
                                        author.setArticleId(article.getId());
                                        author.setAuthorName(authorCn);
                                        author.setAuthorNameCn(authorCn);
                                        if (j < authorEnEles.size()) {
                                            author.setAuthorNameEn(authorEnEles.get(j).text());
                                        }
                                        author.setSortNumber(j+1);
                                        applyAuthorBio(author, bioList);
                                        authorList.add(author);
                                    }
                                }
                            } else {
                                for (int j = 0; j < authors.size(); j++) {
                                    ArticleAuthor author = authors.get(j);
                                    author.setAuthorNameEn(authorEnEles.get(j).text());
                                    applyAuthorBio(author, bioList);
                                    authorList.add(author);
                                }
                            }
                        }
                        // English affiliations: the <h2> after the English authors
                        Element addressEnEle = authorEnEle.nextElementSibling();
                        if ("h2".equals(addressEnEle.tagName())) {
                            Elements addressEnEles = addressEnEle.getElementsByTag("span");
                            List<ArticleAffiliation> affiliations = article.getAffiliations();
                            if (affiliations.size() != addressEnEles.size()) {
                                System.out.println("中英文地址不一致:"+article.getId());
                                Element addressCnEle = contentEle.selectFirst(".top-title + h2 + h2");
                                Elements addressCnEles = addressCnEle.select("a,span");
                                for (int j = 0; j < addressCnEles.size(); j++) {
                                    String addressCn = addressCnEles.get(j).text();
                                    ArticleAffiliation affiliation = null;
                                    if (affiliations.size() > j) {
                                        affiliation = affiliations.get(j);
                                    } else {
                                        affiliation = new ArticleAffiliation();
                                        affiliation.setArticleId(article.getId());
                                    }
                                    affiliation.setAddress(addressCn);
                                    affiliation.setAddressCn(addressCn);
                                    if (j < addressEnEles.size()) {
                                        affiliation.setAddressEn(addressEnEles.get(j).text());
                                    }
                                    affiliation.setSortNumber(j+1);
                                    affiliationList.add(affiliation);
                                }
                            } else {
                                for (int j = 0; j < affiliations.size(); j++) {
                                    ArticleAffiliation affiliation = affiliations.get(j);
                                    affiliation.setAddressEn(addressEnEles.get(j).text());
                                    affiliationList.add(affiliation);
                                }
                            }
                        }
                        Element abstractEle = contentEle.select("#a_abstractEN p").get(0);
                        String abstractEn = abstractEle.outerHtml().replace("<p>", "").replace("</p>", "");
                        // English keywords
                        Elements keywordEles = contentEle.select("#a_keywordsEN p a");
                        if (keywordEles.size() > 0) {
                            List<ArticleKeyword> keywords = article.getKeywords();
                            if (keywords.size() != keywordEles.size()) {
                                Element keywordCnEle = contentEle.getElementById("a_keywords");
                                Elements keywordCnEles = keywordCnEle.select("p a");
                                System.out.println("中英文关键词不一致:"+article.getId()+"["+keywords.size()+"-CN:"+keywordCnEles.size()+"-EN:"+keywordEles.size()+"]");
                                if (keywordCnEles.size() >= keywordEles.size()) {
                                    for (int j = 0; j < keywordCnEles.size(); j++) {
                                        String keywordCn = keywordCnEles.get(j).text();
                                        ArticleKeyword keyword = null;
                                        if (keywords.size() > j) {
                                            keyword = keywords.get(j);
                                        } else {
                                            keyword = new ArticleKeyword();
                                            keyword.setArticleId(article.getId());
                                        }
                                        keyword.setKeyword(keywordCn);
                                        keyword.setKeywordCn(keywordCn);
                                        if (j < keywordEles.size()) {
                                            keyword.setKeywordEn(keywordEles.get(j).text());
                                        } else {
                                            System.out.println("英文关键词比中文少:"+(keywordCnEles.size()-keywordEles.size()));
                                        }
                                        keyword.setSortNum(j+1);
                                        keywordList.add(keyword);
                                    }
                                } else { // more English than Chinese keywords
                                    for (int j = 0; j < keywordEles.size(); j++) {
                                        String keywordEn = keywordEles.get(j).text();
                                        ArticleKeyword keyword = null;
                                        if (keywords.size() > j) {
                                            keyword = keywords.get(j);
                                        } else {
                                            keyword = new ArticleKeyword();
                                            keyword.setArticleId(article.getId());
                                        }
                                        keyword.setKeywordEn(keywordEn);
                                        if (j < keywordCnEles.size()) {
                                            keyword.setKeyword(keywordCnEles.get(j).text());
                                            keyword.setKeywordCn(keywordCnEles.get(j).text());
                                        } else {
                                            keyword.setKeyword("");
                                            keyword.setKeywordCn("");
                                            System.out.println("英文关键词比中文多:"+(keywordEles.size()-keywordCnEles.size()));
                                        }
                                        keyword.setSortNum(j+1);
                                        keywordList.add(keyword);
                                    }
                                }
                            } else {
                                for (int j = 0; j < keywords.size(); j++) {
                                    String keywordStr = keywordEles.get(j).text();
                                    if (!keywordStr.contains("&")) keywordStr = keywordStr.replace(";", "");
                                    ArticleKeyword keyword = keywords.get(j);
                                    keyword.setKeywordEn(keywordStr);
                                    keywordList.add(keyword);
                                }
                            }
                        }
                        // References
                        Pattern chinesePattern = Pattern.compile("[\u4e00-\u9fa5]");
                        Elements referEles = contentEle.select("#a_bibliography p");
                        for (int j = 0; j < referEles.size(); j++) {
                            Element referEle = referEles.get(j).selectFirst("a");
                            String sortStr = "";
                            if (referEle.selectFirst("b") != null) {
                                sortStr = referEle.selectFirst("b").text();
                            } else {
                                sortStr = (j+1)+"";
                            }
                            String referStr = referEle.html().replace(sortStr, "").replace("<b>", "").replace("</b>", "");
                            ArticleReference reference = new ArticleReference();
                            reference.setArticleId(article.getId());
                            reference.setAllinfo(referStr);
                            if ("[]".equals(sortStr)) sortStr = (j+1)+"";
                            reference.setSortnum(Integer.parseInt(sortStr.replace("[", "").replace("]", "")));
                            if (chinesePattern.matcher(referStr).find()) { // contains Chinese characters
                                reference.setAllinfoPartCn(referStr);
                            } else {
                                reference.setAllinfoPartEn(referStr);
                            }
                            referenceList.add(reference);
                        }
                        article.setTitleEn(titleEn);
                        article.setAbstractinfoEn(abstractEn);
                        article.setAuthors(authorList);
                        article.setAffiliations(affiliationList);
                        article.setKeywords(keywordList);
                        article.setRefers(referenceList);
                        article.setRemark2("HTML");
                        newArticleList.add(article);
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                }
                // A driver whose description contains "null" is replaced by a fresh browser;
                // presumably that marks a lost session - confirm. Otherwise close every
                // window except the first one.
                if (driver.toString().contains("null")) {
                    driver = new ChromeDriver(chromeOptions);
                } else {
                    List<String> pageList = new ArrayList<String>(driver.getWindowHandles());
                    for (int j = 1; j < pageList.size(); j++) {
                        driver.switchTo().window(pageList.get(j));
                        driver.close();
                    }
                    pageList = new ArrayList<String>(driver.getWindowHandles());
                    driver.switchTo().window(pageList.get(pageList.size() - 1));
                }
            }
            driver.manage().deleteAllCookies();
        } finally {
            // Always release the browser, also when the loop fails half-way.
            driver.quit();
        }
        // Re-read the catalog: skip when it has been marked as scraped in the meantime
        JournalCatalog journalCatalog = (JournalCatalog) entityManager.createQuery("FROM "+JournalCatalog.class.getName()+" where id=:id").setParameter("id", catalog.getId()).getSingleResult();
        if (journalCatalog.getRemark() != null && journalCatalog.getRemark().contains("HTML")) return; // already scraped
        catalog.setRemark((catalog.getRemark() == null ? "" : catalog.getRemark())+"_HTML");
        try {
            if (newArticleList.size() > 0) {
                entityManager.getTransaction().begin();
                for (int i = 0; i < newArticleList.size(); i++) {
                    Article article = newArticleList.get(i);
                    entityManager.persist(article);
                }
                entityManager.getTransaction().commit();
            }
            if (articleList.size() == htmlCount) {
                entityManager.getTransaction().begin();
                entityManager.persist(catalog);
                entityManager.getTransaction().commit();
            }
        } catch (Exception e) {
            // rollback() is only legal on an active transaction.
            if (entityManager.getTransaction().isActive()) {
                entityManager.getTransaction().rollback();
            }
            e.printStackTrace();
            System.out.println(catalog.getYear()+"_"+catalog.getIssue()+":该期已抓取提交,跳过!!");
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println(catalog.getYear()+"_"+catalog.getIssue()+":数据抓取失败!!");
    } finally {
        // The service started above is separate from the one ChromeDriver creates for itself;
        // stop it so its chromedriver process does not stay behind.
        if (service != null && service.isRunning()) {
            service.stop();
        }
    }
}

/** Copies the first-to-last matching biography (one that mentions the author's Chinese name) onto the author. */
private static void applyAuthorBio(ArticleAuthor author, List<String> bioList) {
    for (int b = 0; b < bioList.size(); b++) {
        if (bioList.get(b).contains(author.getAuthorNameCn())) {
            author.setBio(bioList.get(b));
            author.setBioCn(bioList.get(b));
        }
    }
}
/**
 * Lists the issues shown on a journal's CNKI navigation page as {@link JournalCatalog} objects.
 * <p>
 * Every pager link whose label contains digits is matched with its "yearissue+N" list; each
 * issue link found there yields one catalog carrying the link id, the issue number, the year
 * and a cover image path derived from the journal's cover image file name.
 *
 * @param driver browser session showing the journal page
 * @return the catalogs found, possibly empty; {@code null} only if the pager lookup itself
 *         returned {@code null}
 */
public static List<JournalCatalog> getCnkiJournalCatalogList(WebDriver driver){
    WebElement coverImg = driver.findElement(By.cssSelector("#J_journalPic img"));
    String coverSrc = coverImg.getAttribute("src");
    String coverUrl = coverSrc.startsWith("http") ? coverSrc : "http:" + coverSrc;
    String coverFile = coverUrl.substring(coverUrl.lastIndexOf("/") + 1);

    List<JournalCatalog> catalogs = new ArrayList<JournalCatalog>();
    List<WebElement> pagerLinks = driver.findElements(By.cssSelector(".page-list a"));
    if (pagerLinks == null) {
        return null;
    }

    Pattern digits = Pattern.compile("[0-9]+");
    // Whether the cover file name contains a number; this selects how per-issue names are built.
    boolean coverHasDigits = digits.matcher(coverFile).find();

    for (int page = 0; page < pagerLinks.size(); page++) {
        // Only pager links whose label contains digits are processed.
        WebElement pagerLink = pagerLinks.get(page);
        String label = pagerLink.getText().trim();
        if (label.isEmpty()) {
            label = pagerLink.getAttribute("innerHTML").trim();
        }
        if (!digits.matcher(label).find()) {
            continue;
        }

        WebElement yearIssueBox = driver.findElement(By.id("yearissue+" + page));
        for (WebElement yearBlock : yearIssueBox.findElements(By.tagName("dl"))) { // one block per year
            WebElement yearHeader = yearBlock.findElement(By.tagName("dt"));
            String yearLabel = yearHeader.getText().trim();
            if (yearLabel.isEmpty()) {
                yearLabel = yearHeader.getAttribute("innerHTML").trim().replace("<em>", "").replace("</em>", "");
            }

            for (WebElement issueLink : yearBlock.findElements(By.tagName("a"))) { // one link per issue
                String issueId = issueLink.getAttribute("id");
                String issueNo = issueLink.getText().trim().replace("No.", "");
                if (issueNo.isEmpty()) {
                    issueNo = issueLink.getAttribute("innerHTML").trim().replace("No.", "");
                }

                String issueCover;
                if (coverHasDigits) {
                    issueCover = coverFile.substring(0, 4) + issueId.replace("yq", "") + ".jpg";
                } else {
                    issueCover = coverFile.replace(".jpg", issueId.replace("yq", "") + ".jpg");
                }

                JournalCatalog entry = new JournalCatalog();
                entry.setId(issueId);
                entry.setIssue(issueNo);
                entry.setYear(yearLabel);
                entry.setCoverImgSrc("journal/img/cover/" + issueCover);
                catalogs.add(entry);
            }
        }
    }
    return catalogs;
}
/**
 * Walks every year and issue on the journal page, fetches each listed article's detail page
 * and opens it in the browser.
 * <p>
 * Nothing that is fetched or read is kept: the method always returns {@code null}.
 *
 * @param driver browser session showing the journal page
 * @return always {@code null}
 */
public static List<String> parseIssueHTML(WebDriver driver){
    List<WebElement> pageEles = driver.findElements(By.cssSelector(".page-list a"));
    if (pageEles != null) {
        Pattern pattern = Pattern.compile("[0-9]+");
        for (int i = 0; i < pageEles.size(); i++) {
            // Only pager links whose label contains digits are processed.
            String pageStr = pageEles.get(i).getText().trim();
            if ("".equals(pageStr)) pageStr = pageEles.get(i).getAttribute("innerHTML").trim();
            Matcher matcher = pattern.matcher(pageStr);
            if (matcher.find()) {
                WebElement yearIssueEle = driver.findElement(By.id("yearissue+"+i));
                List<WebElement> yearEles = yearIssueEle.findElements(By.tagName("dl"));
                for (int j = 0; j < yearEles.size(); j++) {
                    WebElement yearEle = yearEles.get(j).findElement(By.tagName("dt"));
                    yearEle.click();
                    try {
                        Thread.sleep(1000);
                    } catch (InterruptedException e) {
                        e.printStackTrace();
                    }
                    WebElement issEle = yearEles.get(j).findElement(By.tagName("dd"));
                    // Skip years whose issue list is not displayed after the click.
                    String isShow = issEle.getCssValue("display");
                    if ("none".equals(isShow)) continue;
                    List<WebElement> issueEles = issEle.findElements(By.tagName("a"));
                    for (int k = 0; k < issueEles.size(); k++) {
                        WebElement issueEle = issueEles.get(k);
                        issueEle.click();
                        try {
                            Thread.sleep(1000);
                        } catch (InterruptedException e) {
                            e.printStackTrace();
                        }
                        WebElement catalogContentEle = driver.findElement(By.id("CataLogContent"));
                        List<WebElement> articleByCatalogEles = catalogContentEle.findElements(By.cssSelector("dd .name a"));
                        for (WebElement articleEle : articleByCatalogEles) {
                            String href = articleEle.getAttribute("href");
                            System.out.println(articleEle.getAttribute("href"));
                            String query = href.substring(href.indexOf("?"));
                            String articleUrl = "https://kns.cnki.net/kcms/detail/detail.aspx"+query;
                            // Client and response are closed per article instead of being leaked.
                            try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
                                HttpPost request = new HttpPost(articleUrl);
                                request.addHeader("User-Agent",userAgent);
                                try (CloseableHttpResponse response = httpClient.execute(request)) {
                                    HttpEntity entity = response.getEntity();
                                    InputStream in = entity.getContent();
                                    Document parse = Jsoup.parse(in, "utf-8", "");
                                }
                                Document document = Jsoup.connect(articleUrl).get();
                                String content = document.outerHtml();
                            } catch (Exception ignored) {
                                // Best effort: the fetched page is not used further, so a failed
                                // request must not stop the walk.
                            }
                            articleEle.click();
                            List<String> tabs = new ArrayList<String> (driver.getWindowHandles()); // all window handles
                            // Switch to the new tab.
                            // NOTE(review): index 2 is hard-coded; it fails with fewer than three
                            // windows and stops pointing at the newest tab once more are open - confirm.
                            WebDriver driver2 = driver.switchTo().window(tabs.get(2));
                            WebElement downloadEle = driver2.findElement(By.id("DownLoadParts"));
                            WebElement htmlEle = downloadEle.findElement(By.className("icon-dlcrsp"));
                            String htmlRead = htmlEle.getAttribute("innerHTML").trim();
                        }
                    }
                }
            }
        }
    }
    return null;
}
/**
 * Looks up a journal on the CNKI source-navigation page and prints the URL and title of the
 * page opened for it.
 *
 * <p>The journal name is read from standard input and typed into the search box of
 * {@code http://navi.cnki.net/knavi/Journal.html}. When the result counter reports exactly one
 * hit, the first result link is clicked; when it reports several, the result titles are listed
 * and a 1-based choice is read from standard input. When it reports none, or no result link is
 * present, a "not found" message is printed and nothing is opened.
 *
 * <p>The browser and the ChromeDriver service are shut down on every exit path, including
 * exceptions (which still propagate to the caller).
 */
public static void getJournalURL() {
    String indexURL = "http://navi.cnki.net/knavi/Journal.html";
    String chromeDriverPath = "D:\\webDriver\\chromedriver.exe";
    WebDriver driver = null;
    ChromeDriverService service = null;
    try {
        HashMap<String, Object> chromePrefs = new HashMap<String, Object>();
        chromePrefs.put("download.default_directory", "D:\\webDriver");
        File chromeDriverFile = new File(chromeDriverPath);
        System.setProperty("webdriver.chrome.driver",chromeDriverPath);
        ChromeOptions chromeOptions = new ChromeOptions();
        chromeOptions.setExperimentalOption("prefs", chromePrefs);
        // Run in headless mode (required).
        chromeOptions.addArguments("--headless");
        chromeOptions.addArguments("--disable-gpu");
        chromeOptions.addArguments("--no-sandbox");
        // Turn off pop-up blocking.
        chromeOptions.addArguments("--disable-popup-blocking");
        // Skip the default-browser check.
        chromeOptions.addArguments("no-default-browser-check");
        chromeOptions.addArguments("about:histograms");
        chromeOptions.addArguments("about:cache");
        chromeOptions.addArguments("--start-maximized");
        // Start the driver service and bind the browser session to it, so that quitting the
        // session also ends the service process.
        service = new ChromeDriverService.Builder().usingDriverExecutable(chromeDriverFile).usingAnyFreePort().build();
        service.start();
        driver = new ChromeDriver(service, chromeOptions);

        driver.get(indexURL);
        try {
            Thread.sleep(1000);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
        WebDriverWait wait = new WebDriverWait(driver, 30);
        wait.until(ExpectedConditions.titleContains("出版来源导航"));

        // A single Scanner serves both prompts: a second Scanner on System.in could miss
        // input the first one has already buffered. It is deliberately not closed, because
        // closing it would close System.in.
        Scanner scanner = new Scanner(System.in);
        WebElement journalNameEle = driver.findElement(By.name("txt_1_value1"));
        System.out.println("请输入期刊名称:");
        String journalName = scanner.next();
        journalNameEle.sendKeys(journalName);

        driver.findElement(By.id("btnSearch")).click();
        try {
            Thread.sleep(1000);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }

        WebElement totalcountEle = driver.findElement(By.className("totalcount"));
        WebElement journalCountBySelEle = totalcountEle.findElement(By.className("lblCount"));
        int journalCount = Integer.parseInt(journalCountBySelEle.getText().trim());
        if(journalCount == 0) {
            System.out.println("未查询或知网未收录本刊:"+journalName);
            return;
        }
        if(journalCount == 1) {
            System.out.println("查询到了!!!!!!!!!!!!!!!!!!!!!!!!!!!!!恭喜");
        }

        WebElement resultEle = driver.findElement(By.cssSelector(".result .list_tup"));
        List<WebElement> journalEles = resultEle.findElements(By.cssSelector("li a"));
        if(journalEles.isEmpty()) {
            // The counter reported hits but there is no link to open.
            System.out.println("未查询或知网未收录本刊:"+journalName);
            return;
        }

        WebElement journalEle;
        if(journalCount == 1) {
            journalEle = journalEles.get(0);
        }else {
            // More than one hit: let the user pick.
            for(int i=0;i<journalEles.size();i++) {
                String journalTitle = journalEles.get(i).getAttribute("title");
                System.out.println(i+1 +": "+ journalTitle);
            }
            System.out.println("选择期刊:(选择期刊进行抓取)");
            String journalIndex = scanner.next();
            journalEle = journalEles.get(Integer.parseInt(journalIndex)-1);
        }

        // Window handles are an unordered Set, so the journal window is found as "the handle
        // that is not the search window" rather than by position.
        String searchHandle = driver.getWindowHandle();
        journalEle.click();
        for (String handle : driver.getWindowHandles()) {
            if(!handle.equals(searchHandle)) {
                driver.switchTo().window(handle);
                break;
            }
        }
        // If the click opened no new window, the current window is reported.
        System.out.println(driver.getCurrentUrl());
        System.out.println(driver.getTitle());
    } catch (IOException e1) {
        // The driver service could not be started; there is no browser to work with.
        e1.printStackTrace();
    } finally {
        if(driver != null) driver.quit();
        if(service != null) service.stop();
    }
}
/**
 * Downloads the PDFs of one journal issue from CNKI and files them in the journal's folder.
 *
 * <p>The work is done in three steps, each in its own Edge session:
 * <ol>
 * <li>the journal page is opened and {@code parseArticleHTMLByYearIssue} supplies the article
 *     links for the requested year and issue;</li>
 * <li>each article page is opened and its PDF button clicked, unless the browser download
 *     folder already holds a file whose name contains the first author shown on the page;</li>
 * <li>the issue's table of contents is opened again, and for every entry each downloaded file
 *     whose name contains the entry's first author is copied to
 *     {@code <journalName>/<year>-<issue>-<page>.pdf} below the webDriver folder and then
 *     deleted from the download folder.</li>
 * </ol>
 *
 * <p>All sessions are quit before the method returns, also when an exception propagates.
 *
 * @param journalUrl  URL of the journal page on CNKI
 * @param journalName name of the target sub-folder for the filed PDFs
 * @param year        year to process, compared with the year labels on the page
 * @param issue       issue to process; a value below 10 without a {@code 0} is zero-padded
 *                    before it is compared with the issue labels and used in file names
 */
public void downloadPDF(String journalUrl,String journalName,String year,String issue) {
    // The link parser receives the issue exactly as passed in (not zero-padded).
    JournalCatalog catalog = new JournalCatalog();
    catalog.setYear(year);
    catalog.setIssue(issue);
    File browserSavePath = new File("C:\\Users\\wsh\\Downloads");
    System.setProperty("webdriver.edge.driver","D:\\webDriver\\msedgedriver.exe");

    // Step 1: collect the article links of the issue.
    List<String> articleLinkByYearIssue;
    WebDriver linkDriver = new EdgeDriver();
    try {
        linkDriver.get(journalUrl);
        sleepRestoringInterrupt(1000);
        articleLinkByYearIssue = (List<String>) parseArticleHTMLByYearIssue(linkDriver,catalog);
    } finally {
        linkDriver.quit();
    }

    WebDriver downloadDriver = new EdgeDriver();
    WebDriver catalogDriver = null;
    try {
        // Step 2: trigger the downloads. A missing link list is treated as "no articles".
        if(articleLinkByYearIssue != null) {
            for (String articleLink : articleLinkByYearIssue) {
                downloadArticlePdf(downloadDriver, articleLink, browserSavePath);
            }
        }

        // Step 3: rename and move the downloaded files. The download session stays open
        // until the very end so that a download still in progress is not cut off.
        catalogDriver = new EdgeDriver();
        catalogDriver.get(journalUrl);
        String paddedIssue = padLowIssueNumber(issue);
        WebDriverWait wait = new WebDriverWait(catalogDriver, 10);
        List<WebElement> pageEles = null;
        try {
            pageEles = wait.until(ExpectedConditions.presenceOfAllElementsLocatedBy(By.cssSelector(".page-list a")));
        } catch (Exception e) {
            // No pager showed up in time: look once more with a short implicit wait.
            catalogDriver.manage().timeouts().implicitlyWait(3, TimeUnit.SECONDS);
            pageEles = catalogDriver.findElements(By.cssSelector(".page-list a"));
        }
        if(pageEles != null && !pageEles.isEmpty()) {
            Pattern pattern = Pattern.compile("[0-9]+");
            for (int i = 0; i < pageEles.size(); i++) {
                // Only pager links whose label contains a digit are processed; the index i
                // still counts the skipped links because it is part of the container id.
                String pageStr = pageEles.get(i).getText().trim();
                if("".equals(pageStr)) pageStr = pageEles.get(i).getAttribute("innerHTML").trim();
                if(pattern.matcher(pageStr).find()) {
                    WebElement yearIssueEle = catalogDriver.findElement(By.id("yearissue+"+i));
                    fileDownloadedPdfs(catalogDriver, yearIssueEle, year, paddedIssue, journalName, browserSavePath);
                }
            }
        }else {
            // No pager: the single year/issue container has index 0.
            WebElement yearIssueEle = catalogDriver.findElement(By.id("yearissue+0"));
            fileDownloadedPdfs(catalogDriver, yearIssueEle, year, paddedIssue, journalName, browserSavePath);
        }
    } finally {
        try {
            if(catalogDriver != null) catalogDriver.quit();
        } finally {
            downloadDriver.quit();
        }
    }
}

/** Sleeps for {@code millis}; an interrupt ends the sleep and is re-flagged on the thread. */
private static void sleepRestoringInterrupt(long millis) {
    try {
        Thread.sleep(millis);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
    }
}

/**
 * Zero-pads an issue whose integer value is below 10 and that contains no {@code 0}
 * character ("3" becomes "03"); any other value, including a non-numeric one, is returned as is.
 */
private static String padLowIssueNumber(String issue) {
    try {
        if(Integer.parseInt(issue) < 10 && !issue.contains("0")) return "0"+issue;
    } catch (NumberFormatException ignored) {
        // Not a plain number: compare it with the page labels unchanged.
    }
    return issue;
}

/** Loads {@code url}, retrying once after three seconds; returns whether a load succeeded. */
private static boolean openPageWithRetry(WebDriver driver, String url) {
    try {
        driver.get(url);
        return true;
    } catch (Exception firstFailure) {
        sleepRestoringInterrupt(3000);
        try {
            driver.get(url);
            return true;
        } catch (Exception secondFailure) {
            System.out.println("页面打开失败!!!!");
            return false;
        }
    }
}

/** Moves the driver's focus to the last handle in the current window-handle listing. */
private static void switchToLastWindow(WebDriver driver) {
    List<String> windowHandles = new ArrayList<String>(driver.getWindowHandles());
    driver.switchTo().window(windowHandles.get(windowHandles.size()-1));
}

/**
 * Opens one article page and clicks its PDF download button, unless a file naming the
 * article's first author is already present in {@code downloadDir}.
 */
private static void downloadArticlePdf(WebDriver driver, String articleLink, File downloadDir) {
    // Without a freshly loaded page the browser would still show the previous article.
    if(!openPageWithRetry(driver, articleLink)) return;

    WebDriverWait wait = new WebDriverWait(driver, 10);
    WebElement btnsEle = wait.until(ExpectedConditions.presenceOfElementLocated(By.id("DownLoadParts")));

    List<WebElement> authorEles = driver.findElements(By.cssSelector("#authorpart span"));
    if(!authorEles.isEmpty()) {
        final String authorStr = authorEles.get(0).getText();
        // An empty author would match every file name, so the duplicate check is skipped then.
        if(authorStr != null && !authorStr.isEmpty()) {
            File[] files = downloadDir.listFiles(new FileFilter() {
                @Override
                public boolean accept(File file) {
                    return file.isFile() && file.getName().contains(authorStr);
                }
            });
            // listFiles yields null when the folder is missing or unreadable.
            if(files != null && files.length > 0) {
                for (File file : files) {
                    System.out.println(file.getAbsolutePath() + " 文件重复啦啦啦啦啦!!!");
                }
                return;
            }
        }
    }

    WebElement btnDownloadEle;
    try {
        btnDownloadEle = btnsEle.findElement(By.cssSelector(".btn-dlpdf a"));
    } catch (Exception e) {
        System.out.println("未找到下载按钮!!!");
        return;
    }
    btnDownloadEle.click();
    sleepRestoringInterrupt(5000);

    List<String> windowHandles = new ArrayList<String>(driver.getWindowHandles());
    if(windowHandles.size() > 1) {
        try {
            driver.switchTo().window(windowHandles.get(windowHandles.size()-1));
            if(driver.getCurrentUrl().contains("https://kdoc.cnki.net/kdoc/download.aspx")) {
                // Wait until the download window is gone and one window is left, polling
                // with a short pause instead of spinning.
                while (driver.getWindowHandles().size() != 1 && !Thread.currentThread().isInterrupted()) {
                    sleepRestoringInterrupt(200);
                }
                switchToLastWindow(driver);
            }
        } catch (Exception e) {
            switchToLastWindow(driver);
        }
    }else {
        switchToLastWindow(driver);
    }
}

/**
 * Within one year/issue container, opens the requested year and issue and moves the
 * downloaded file of every table-of-contents entry to its final name.
 *
 * @param issue issue label to match, already zero-padded where applicable
 */
private static void fileDownloadedPdfs(WebDriver driver, WebElement yearIssueEle, String year,
        String issue, String journalName, File downloadDir) {
    List<WebElement> yearEles = yearIssueEle.findElements(By.tagName("dl"));
    for (WebElement yearBlock : yearEles) {
        WebElement yearEle = yearBlock.findElement(By.tagName("dt"));
        if(!year.equals(yearEle.getText().trim())) continue; // not the requested year

        yearEle.click();
        sleepRestoringInterrupt(1000);
        WebElement issEle = new WebDriverWait(driver, 10)
                .until(ExpectedConditions.presenceOfNestedElementLocatedBy(yearBlock, By.tagName("dd")));
        // A year whose issue list is still hidden after the click is skipped.
        if("none".equals(issEle.getCssValue("display"))) continue;

        for (WebElement issueEle : issEle.findElements(By.tagName("a"))) {
            String issueLabel = issueEle.getAttribute("innerHTML").replace("No.", "").trim();
            if(!issue.equals(issueLabel)) continue; // not the requested issue

            issueEle.click();
            sleepRestoringInterrupt(1000);
            WebElement catalogContentEle = driver.findElement(By.id("CataLogContent"));
            for (WebElement entry : catalogContentEle.findElements(By.cssSelector("dd"))) {
                String author = entry.findElement(By.cssSelector(".author")).getAttribute("innerHTML").trim();
                String[] authors = author.split(";");
                // The content of the entry's .company element supplies the page part of the name.
                String page = entry.findElement(By.cssSelector(".company")).getAttribute("innerHTML").trim();
                // An empty first author would match, and so move and delete, every file.
                if(authors.length == 0 || authors[0].isEmpty()) continue;

                String publisherID = year+"-"+issue+"-"+page;
                File[] listFiles = downloadDir.listFiles();
                if(listFiles == null) continue; // download folder missing or unreadable
                for (File file : listFiles) {
                    if(file.isFile() && file.getName().contains(authors[0])) {
                        try {
                            FileUtils.copyFile(file, new File("D:\\webDriver\\"+journalName+"\\"+publisherID+".pdf"));
                            file.delete();
                        } catch (IOException e) {
                            e.printStackTrace();
                        }
                    }
                }
            }
        }
    }
}
/**
 * Reads the abstract of an article from its CNKI detail page and saves the article.
 *
 * <p>The detail page is requested with {@code article.getId()} as the {@code filename} query
 * parameter. In every {@code .row} element whose text contains "摘要", the "more" link
 * ({@code #ChDivSummaryMore}) is clicked when it is displayed, and the inner HTML of
 * {@code #ChDivSummary} is stored as both {@code abstractinfo} and {@code abstractinfoCn};
 * {@code remark1} is set to {@code "cnki"}. The article is then persisted in its own
 * transaction, whether or not an abstract row was found.
 *
 * <p>The method never throws: failures are printed and the browser and driver service are
 * always shut down.
 *
 * @param entityManager entity manager used for the transaction
 * @param article       article to update; its id selects the CNKI page
 */
public void updateAbstrat(EntityManager entityManager,Article article) {
    WebDriver driver = null;
    ChromeDriverService service = null;
    try {
        String chromeDriverPath = "D:\\webDriver\\chromedriver.exe";
        HashMap<String, Object> chromePrefs = new HashMap<String, Object>();
        ChromeOptions chromeOptions = new ChromeOptions();
        chromePrefs.put("download.default_directory", "D:\\webDriver");
        File chromeDriverFile = new File(chromeDriverPath);
        System.setProperty("webdriver.chrome.driver",chromeDriverPath);
        chromeOptions.setExperimentalOption("prefs", chromePrefs);
        // Run in headless mode (required).
        chromeOptions.addArguments("--headless");
        chromeOptions.addArguments("--disable-gpu");
        chromeOptions.addArguments("--no-sandbox");
        // Turn off pop-up blocking.
        chromeOptions.addArguments("--disable-popup-blocking");
        // Skip the default-browser check.
        chromeOptions.addArguments("no-default-browser-check");
        chromeOptions.addArguments("about:histograms");
        chromeOptions.addArguments("about:cache");
        chromeOptions.addArguments("--start-maximized");
        try {
            service = new ChromeDriverService.Builder().usingDriverExecutable(chromeDriverFile).usingAnyFreePort().build();
            service.start();
        } catch (IOException e1) {
            e1.printStackTrace();
            // Fall back to a driver-managed service located through the system property.
            service = null;
        }
        // Bind the session to the started service so that it is actually used and ends with it.
        if(service != null) {
            driver = new ChromeDriver(service, chromeOptions);
        }else {
            driver = new ChromeDriver(chromeOptions);
        }

        try {
            driver.get("https://kns.cnki.net/kcms/detail/detail.aspx?sfield=FN&dbCode=CJFD&filename="+article.getId()+"&tableName=CJFD2000&url=");
            Thread.sleep(1000);
            List<WebElement> rowEles = driver.findElements(By.className("row"));
            for (WebElement rowEle : rowEles) {
                String content = rowEle.getText();
                if(!content.contains("摘要")) continue;
                try {
                    WebElement moreEle = rowEle.findElement(By.id("ChDivSummaryMore"));
                    if(moreEle.isDisplayed()) moreEle.click();
                } catch (Exception ignored) {
                    // Best effort: not every abstract has a "more" link to expand.
                }
                String abstractInfo = rowEle.findElement(By.id("ChDivSummary")).getAttribute("innerHTML")
                        .replace("<span id=\"ChDivSummary\" name=\"ChDivSummary\" class=\"abstract-text\">", "").replace("</span>", "");
                article.setAbstractinfo(abstractInfo);
                article.setAbstractinfoCn(abstractInfo);
                article.setRemark1("cnki");
            }
            try {
                // NOTE(review): persist() only suits a new or already managed entity; if callers
                // pass detached articles, merge() is presumably what is wanted - confirm.
                entityManager.getTransaction().begin();
                entityManager.persist(article);
                entityManager.getTransaction().commit();
                System.out.println(article.getId()+":cnki");
            } catch (Exception e) {
                e.printStackTrace();
                // rollback() is only legal while the transaction is active.
                if(entityManager.getTransaction().isActive()) {
                    entityManager.getTransaction().rollback();
                }
            }
        } catch (InterruptedException e) {
            // Interrupted while waiting for the page: skip the update, keep the flag set.
            Thread.currentThread().interrupt();
        } catch (Exception e) {
            e.printStackTrace();
        }
        driver.manage().deleteAllCookies();
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        try {
            if(driver != null) driver.quit();
        } catch (Exception e) {
            e.printStackTrace();
        }
        try {
            if(service != null) service.stop();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
}