使用 Selenium 工具爬取知网(CNKI)相关数据。思路:根据几个关键词搜索出相关内容,然后爬取结果列表中所有论文的详情页链接。
注意:直接爬取到的链接不能直接访问,需要自己重新拼接(代码中是把链接里的 "kns/" 替换为 "KCMS/"),具体看代码。本人是新手,代码写得有点乱,里面还穿插了一些对关键词的简单分析,不喜勿喷,谢谢。
直接上代码
package com.test.demo.controller;
import org.apache.xmlbeans.impl.xb.xsdschema.All;
import org.checkerframework.checker.nullness.compatqual.NullableDecl;
import org.json.JSONObject;
import org.openqa.selenium.By;
import org.openqa.selenium.Keys;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.support.ui.ExpectedCondition;
import org.openqa.selenium.support.ui.WebDriverWait;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.stereotype.Controller;
import org.springframework.util.ResourceUtils;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.ResponseBody;
import java.io.*;
import java.util.*;
import java.util.concurrent.TimeUnit;
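/**
* Overall workflow:
* 1: crawl the detail-page links of all papers and store them in a text file
* 2: visit each stored detail link and collect the corresponding data
* 3: analyse the collected data and write the result to the keyWord.json file
*/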
@Controller
@RequestMapping("/CNKISpidr")
public class CNKISpiderController {
/**
 * Runs the complete crawl for one search request.
 *
 * <p>The three stages run strictly one after another: {@code initqueryAndGetData}
 * performs the search with the given terms, {@code spiderByUrl} visits the stored
 * detail links, and {@code writeJson} writes the keyword analysis result.
 *
 * @param themeKey    term forwarded to the search stage as the theme
 * @param Key         term forwarded to the search stage as the keyword
 * @param abstractKey term forwarded to the search stage as the abstract term
 * @param pageNum     kept so existing requests still bind; this method does not read it
 * @return always {@code 1} once all three stages have finished
 * @throws Exception if any of the three stages fails
 */
@RequestMapping(value = "/spider", method = RequestMethod.POST)
@ResponseBody
public Integer spiderCNKI(String themeKey, String Key, String abstractKey, String pageNum) throws Exception {
    // Stage 1: run the search and store every detail link and publication date.
    initqueryAndGetData(themeKey, Key, abstractKey);
    // Stage 2: open each stored detail link and collect the paper data.
    spiderByUrl();
    // Stage 3: analyse the collected keywords and write the JSON result.
    writeJson();
    return 1;
}
/**
 * Builds the keyword graph and writes it to {@code keyWord.json} in the data directory.
 *
 * <p>Input is {@code text.txt}: every line containing "关键词" contributes the text after
 * its first ":" as one paper's ";"-separated keyword list. Each keyword that matches an
 * entry returned by {@code keysWordCount()} becomes a node whose size and colour depend
 * on that entry's frequency; consecutive keywords of the same paper become a link.
 * The file content is {@code {"nodes":[...],"links":[...]}}.
 *
 * <p>All inputs are read before the output file is opened, so a failure while reading
 * no longer leaves a truncated {@code keyWord.json} behind.
 *
 * @throws Exception if the data directory cannot be resolved or a file cannot be read or written
 */
private void writeJson() throws Exception {
    // Resolve <classpath root>/src/main/webapp/data, creating it when missing.
    File path1 = new File(ResourceUtils.getURL("classpath:").getPath());
    if (!path1.exists()) {
        path1 = new File("");
    }
    File upload = new File(path1.getAbsolutePath(), "src/main/webapp/data");
    if (!upload.exists()) {
        upload.mkdirs();
    }

    // File names are appended with a literal backslash, exactly like the other
    // methods of this class do for the same directory, so all of them agree on the paths.
    File filekey = new File(upload.getAbsolutePath() + "\\text.txt");
    File file = new File(upload.getAbsolutePath() + "\\keyWord.json");

    // One entry per paper: its raw keyword string.
    List<String> array = new ArrayList<>();
    // NOTE(review): FileReader/FileWriter use the platform default charset here, as
    // everywhere else in this class; changing it in one place only would break the round trip.
    try (BufferedReader in = new BufferedReader(new FileReader(filekey))) {
        String line;
        while ((line = in.readLine()) != null) {
            if (line.contains("关键词")) {
                array.add(line.substring(line.indexOf(":") + 1));
            }
        }
    }
    System.out.println("array:"+array.size());

    // Frequency of every keyword.
    Map<String, Integer> countKeysMap = keysWordCount();

    // Nodes: one per (paper keyword, matching frequency entry); duplicates are removed below.
    List<Map<String, Object>> nodesList = new ArrayList<Map<String, Object>>();
    int count = 0;
    for (String paperKeywords : array) {
        String[] re = paperKeywords.split(";");
        count = count + re.length;
        for (String rawKeyword : re) {
            String name = rawKeyword.trim();
            // Entries are compared after trimming, so more than one map key may match
            // the same keyword; that is why this stays a scan instead of a single get().
            for (Map.Entry<String, Integer> entry : countKeysMap.entrySet()) {
                if (entry.getKey().trim().equals(name)) {
                    nodesList.add(keywordGraphNode(name, entry.getValue()));
                }
            }
        }
    }

    // Equal node maps collapse to one element.
    Set<Map<String, Object>> keysSet = new HashSet<Map<String, Object>>();
    keysSet.addAll(nodesList);
    System.out.println("count :"+count);
    System.out.println("所有节点数:"+nodesList.size());
    System.out.println("去重之后的节点数:"+keysSet.size());

    // Links: each keyword of a paper points to the next keyword of the same paper.
    List<Map<String, Object>> nodesLinkList = new ArrayList<Map<String, Object>>();
    for (String paperKeywords : array) {
        String[] re = paperKeywords.split(";");
        for (int m = 0; m < re.length - 1; m++) {
            Map<String, Object> linksMap = new HashMap<String, Object>();
            linksMap.put("source", re[m].trim());
            linksMap.put("target", re[m + 1].trim());
            nodesLinkList.add(linksMap);
        }
    }
    System.out.println("边数:"+nodesLinkList.size());

    String jsonObject1 = JSONObject.valueToString(JSONObject.wrap(keysSet));
    String jsonObject2 = JSONObject.valueToString(JSONObject.wrap(nodesLinkList));

    // false = overwrite; closing the writer flushes it, also when write() throws.
    try (BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(file, false))) {
        bufferedWriter.write("{\"nodes\":" + jsonObject1 + ",\"links\":" + jsonObject2 + "}");
    }
}

/**
 * Creates one graph node for a keyword; size and colour are picked from the frequency band.
 *
 * @param name      trimmed keyword used as the node name
 * @param frequency how often the keyword occurs
 * @return map with the keys {@code name}, {@code symbolSize} and {@code itemStyle}
 */
private static Map<String, Object> keywordGraphNode(String name, int frequency) {
    final int symbolSize;
    final String color;
    if (frequency == 1) {
        symbolSize = 3;
        color = "#c23531";
    } else if (frequency > 1 && frequency <= 50) {
        symbolSize = 8;
        color = "#2f4554";
    } else if (frequency > 50 && frequency <= 100) {
        symbolSize = 30;
        color = "#61a0a8";
    } else if (frequency > 100 && frequency <= 300) {
        symbolSize = 40;
        color = "#d48265";
    } else if (frequency > 300 && frequency <= 500) {
        symbolSize = 50;
        color = "#91c7ae";
    } else {
        symbolSize = 60;
        color = "#749f83";
    }
    Map<String, Object> colorMap = new HashMap<String, Object>();
    colorMap.put("color", color);
    Map<String, Object> nodesMap = new HashMap<String, Object>();
    nodesMap.put("name", name);
    nodesMap.put("symbolSize", symbolSize);
    nodesMap.put("itemStyle", colorMap);
    return nodesMap;
}
/**
 * Counts how often every keyword occurs in {@code textKey.txt}.
 *
 * <p>All lines of the file are joined without a separator and the result is split on ";".
 * Every token is counted exactly as it appears (no trimming).
 *
 * <p>Keywords that occur only once are part of the result as well: {@code writeJson}
 * has a dedicated branch for frequency 1, so nothing is filtered out here.
 *
 * @return keyword to frequency, iterating from the most to the least frequent keyword
 * @throws Exception if the data directory cannot be resolved or the file cannot be read
 */
private Map<String, Integer> keysWordCount() throws Exception {
    // Resolve <classpath root>/src/main/webapp/data, creating it when missing.
    File path1 = new File(ResourceUtils.getURL("classpath:").getPath());
    if (!path1.exists()) {
        path1 = new File("");
    }
    File upload = new File(path1.getAbsolutePath(), "src/main/webapp/data");
    if (!upload.exists()) {
        upload.mkdirs();
    }
    File filekey = new File(upload.getAbsolutePath() + "\\textKey.txt");

    // Read the whole file; the reader is closed even when reading fails.
    StringBuilder content = new StringBuilder();
    try (BufferedReader bufferedReader = new BufferedReader(new FileReader(filekey))) {
        String s;
        while ((s = bufferedReader.readLine()) != null) {
            content.append(s);
        }
    }

    // Count every ";"-separated token.
    StringTokenizer st = new StringTokenizer(content.toString(), ";");
    Map<String, Integer> hm = new HashMap<String, Integer>();
    while (st.hasMoreTokens()) {
        String word = st.nextToken();
        Integer seen = hm.get(word);
        hm.put(word, seen == null ? 1 : seen + 1);
    }

    // Sort by frequency, highest first; the sort is stable, so ties keep the map's order.
    List<Map.Entry<String, Integer>> keyList = new ArrayList<Map.Entry<String, Integer>>(hm.entrySet());
    Collections.sort(keyList, new Comparator<Map.Entry<String, Integer>>() {
        @Override
        public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
            return o2.getValue().compareTo(o1.getValue());
        }
    });

    // A LinkedHashMap keeps the sorted order for callers that iterate the result.
    Map<String, Integer> result = new LinkedHashMap<String, Integer>();
    for (Map.Entry<String, Integer> entry : keyList) {
        result.put(entry.getKey(), entry.getValue());
    }
    return result;
}
/**
 * Runs the search on CNKI and stores, for every result row, the detail link and the
 * publication date.
 *
 * <p>Detail links go to {@code detailUrl.txt} (one per line, with "kns/" replaced by
 * "KCMS/"), publication dates go to {@code yearText.txt} in the same row order, so the
 * n-th line of one file belongs to the n-th line of the other. Both files are overwritten.
 *
 * <p>Every result page is processed, including the last one. The browser is always shut
 * down and both files are always closed, also when the crawl fails half-way.
 *
 * @param themeStr       text typed into the first search field
 * @param keyStr         text typed into the second search field
 * @param abstractKeyStr text typed into the third search field
 * @throws Exception if the data directory cannot be resolved, a file cannot be written,
 *                   or an expected page element is missing
 */
private void initqueryAndGetData(String themeStr, String keyStr, String abstractKeyStr) throws Exception {
    // Location of the Chrome driver binary.
    System.setProperty("webdriver.chrome.driver",
            "D:\\Google\\Chrome\\Application\\chromedriver.exe");

    // Resolve <classpath root>/src/main/webapp/data, creating it when missing.
    File path1 = new File(ResourceUtils.getURL("classpath:").getPath());
    if (!path1.exists()) {
        path1 = new File("");
    }
    File upload = new File(path1.getAbsolutePath(), "src/main/webapp/data");
    if (!upload.exists()) {
        upload.mkdirs();
    }
    File fileDetail = new File(upload.getAbsolutePath() + "\\detailUrl.txt");
    File yearTextFile = new File(upload.getAbsolutePath() + "\\yearText.txt");

    ChromeDriver driver = new ChromeDriver();
    // false = overwrite. The writers are closed first, then the browser is quit.
    try (BufferedWriter detailWriter = new BufferedWriter(new FileWriter(fileDetail, false));
         BufferedWriter yearWriter = new BufferedWriter(new FileWriter(yearTextFile, false))) {
        driver.executeScript("window.scrollTo(0, document.body.scrollHeight);");

        // Entry page of the search.
        String websiteUel = "http://kns.cnki.net/kns/brief/result.aspx?dbprefix=SCDB&crossDbcodes=CJFQ,CDFD,CMFD,CPFD,IPFD,CCND,CCJD";
        driver.get(websiteUel);

        // First condition: theme.
        driver.findElement(By.name("txt_1_value1")).sendKeys(themeStr);

        // Second condition: pick the second logical operator, then type the keyword.
        driver.findElement(By.xpath("//*[@id=\"txt_2_logical\"]/option[2]")).click();
        driver.findElement(By.name("txt_2_value1")).sendKeys(keyStr);

        // "+" adds a third condition row.
        driver.findElement(By.xpath("//*[@id=\"txt_1\"]/td[1]/a[1]")).click();

        // Third condition: pick the field and the logical operator, type the abstract term
        // and submit the search with ENTER.
        driver.findElement(By.xpath("//*[@id=\"txt_3_sel\"]/option[4]")).click();
        driver.findElement(By.xpath("//*[@id=\"txt_3_logical\"]/option[2]")).click();
        WebElement zhaiYaoValue = driver.findElement(By.name("txt_3_value1"));
        zhaiYaoValue.sendKeys(abstractKeyStr);
        zhaiYaoValue.sendKeys(Keys.ENTER);

        // The result list lives inside an iframe.
        WebElement iframe = driver.findElement(By.id("iframeResult"));
        driver.switchTo().frame(iframe);
        Thread.sleep(2000);

        // Third link of the "rows per page" selector.
        WebElement fiveB = driver.findElement(By.id("id_grid_display_num"));
        List<WebElement> fiveBtn = fiveB.findElements(By.tagName("a"));
        fiveBtn.get(2).click();

        // Total page count: the text after "/" in the pager label.
        WebElement allPageNumEle = driver.findElement(By.xpath("//*[@id=\"J_ORDER\"]/tbody/tr[2]/td/table/tbody/tr/td[2]/div/span[1]"));
        String allPageNumStr = allPageNumEle.getText();
        allPageNumStr = allPageNumStr.substring(allPageNumStr.indexOf("/") + 1);
        int allPage = Integer.parseInt(allPageNumStr.trim());
        System.out.println(allPage);

        for (int page = 0; page < allPage; page++) {
            // Re-select the current window (which resets the focus to the top-level
            // document) and enter the result iframe again. The former code looped over all
            // window handles with a reference comparison that was always true, which had
            // exactly this effect for the single browser window.
            driver.switchTo().window(driver.getWindowHandle());
            driver.switchTo().frame(iframe);

            writeResultRows(driver, detailWriter, yearWriter);

            // The last page has been stored; there is nothing left to click.
            if (page == allPage - 1) {
                break;
            }
            // Long pause before every 14th page turn.
            // NOTE(review): presumably to avoid the site's anti-crawler check - confirm.
            if (page != 0 && page % 14 == 0) {
                Thread.sleep(90000);
            }
            // findElements returns an empty list instead of throwing when the button is gone.
            List<WebElement> nextButtons = driver.findElements(By.id("Page_next"));
            if (nextButtons.isEmpty()) {
                System.out.println("element is noe exist");
                break;
            }
            nextButtons.get(0).click();
        }
    } finally {
        // Always end the browser session, otherwise every request leaves a Chrome running.
        driver.quit();
    }
}

/**
 * Writes the detail link and the publication date of every row of the result table that
 * is currently shown in the result iframe.
 *
 * @param driver       driver whose focus is already inside the result iframe
 * @param detailWriter receives one rewritten detail link per row
 * @param yearWriter   receives one publication date per row; flushed after every row
 * @throws IOException if writing to one of the files fails
 */
private void writeResultRows(WebDriver driver, BufferedWriter detailWriter, BufferedWriter yearWriter) throws IOException {
    List<WebElement> tb = driver.findElements(By.xpath("//*[@id=\"ctl00\"]/table/tbody/tr[2]"));
    for (WebElement t : tb) {
        for (WebElement tbody : t.findElements(By.tagName("tbody"))) {
            List<WebElement> rows = new ArrayList<WebElement>(tbody.findElements(By.tagName("tr")));
            if (rows.isEmpty()) {
                continue;
            }
            // The first row is skipped - presumably the table header.
            rows.remove(0);
            for (int j = 0; j < rows.size(); j++) {
                List<WebElement> cells = rows.get(j).findElements(By.tagName("td"));
                // Table rows are 1-based in XPath and the first one was skipped, hence j + 2.
                WebElement link = driver.findElement(By.xpath("//*[@id=\"ctl00\"]/table/tbody/tr[2]/td/table/tbody/tr[" + (j + 2) + "]/td[2]/a"));
                // The listed href is not usable as-is; the detail page is reached via KCMS/.
                String detailUrl = link.getAttribute("href").replace("kns/", "KCMS/");
                detailWriter.write(detailUrl + "\n");
                // Fifth cell: publication date; one line per paper.
                String year = cells.get(4).getText();
                System.out.println(year);
                yearWriter.write(year + "\n");
                yearWriter.flush();
            }
        }
    }
}
/**
* 通过详情url来爬取内容
* 注意:读取两个文本文件(存放详情链接的,存放出版年月的)
* 存储三个文本文件(所有的内容文本,只有关键词的文本,关键词语出版年月一一对应的文本)
* @throws Exception
* @param
* @param
*/
/* @RequestMapping("/spiderByUrl")
@ResponseBody*/
public void spiderByUrl() throws Exception{
// 声明谷歌浏览器
System.setProperty("webdriver.chrome.driver",
"D:\\Google\\Chrome\\Application\\chromedriver.exe");
WebDriver chromdriver;
chromdriver = new ChromeDriver();
/
//声明火狐浏览器
WebDriver firedriver;
//火狐的安装位置
System.setProperty("webdriver.firefox.bin","D:\\fireBrower\\firefox.exe");
//加载驱动
System.setProperty("webdriver.firefox.marionette","D:\\fireBrower\\geckodriver.exe");
firedriver=new FirefoxDriver();
//firedriver.get("http://sms.webchinese.cn/api.shtml");
///
File path1 = new File(ResourceUtils.getURL("classpath:").getPath());
if (!path1.exists()) path1 = new File("");
// System.out.println("path:"+path1.getAbsolutePath());
File upload = new File(path1.getAbsolutePath(), "src/main/webapp/data");
if (!upload.exists()) upload.mkdirs();
String pathKey = upload.getAbsolutePath() + "\\detailUrl.txt"; // 这里最终是 detailUrl.txt
// 路径文件
File filekey = new File(pathKey);
BufferedReader in = new BufferedReader(new FileReader(filekey));
String pathYearText = upload.getAbsolutePath() + "\\yearText.txt";
File fileYearText = new File(pathYearText);
BufferedReader inYear = new BufferedReader(new FileReader(fileYearText));
List<String> yearListRe = new ArrayList<>();
String lineYear = null;
while ((lineYear = inYear.readLine()) != null){
yearListRe.add(lineYear);
}
String line = null;
int i = 0;
List<String> reUrlList = new ArrayList<>();
//循环把读取到的字符赋给str
while ((line = in.readLine()) != null) {
//System.out.println(line);
reUrlList.add(line); //读取详情列表中的每一行
}
//
String pathText = upload.getAbsolutePath() + "\\text.txt"; // 最终是 text.txt
File fileText = new File(pathText);
FileWriter fileWriter = new FileWriter(fileText, false);
BufferedWriter bufferedWriter = new BufferedWriter(fileWriter);
//
String pathKeyTest = upload.getAbsolutePath() + "\\textKey.txt"; // 最终是 textKey.txt
File fileKeyTest = new File(pathKeyTest);
FileWriter fileWriterKey = new FileWriter(fileKeyTest, false);
BufferedWriter bufferedWriterKey = new BufferedWriter(fileWriterKey);
String pathKeyTest2 = upload.getAbsolutePath() + "\\textKeyAndYear.txt"; // 最终是 textKey.txt
File fileKeyTest2 = new File(pathKeyTest2);
FileWriter fileWriterKey2 = new FileWriter(fileKeyTest2, false);
BufferedWriter bufferedWriterKey2 = new BufferedWriter(fileWriterKey2);
int CountNumber = 1;
for (int j = 0; j < reUrlList.size(); j++) {
if ( j % 2 == 0 ){ // 偶数
if (j == 0){
Thread.sleep(1000);
chromdriver.get(reUrlList.get(j));
// chromdriver.manage().timeouts().implicitlyWait(10, TimeUnit.SECONDS);
// System.out.println(chromdriver.getWindowHandle());
//Thread.sleep(500);
//chromdriver.navigate().refresh();
System.out.println(CountNumber++);
String TitleStr = "";
String AuthorStr = "";
String DepartStr = "";
String ZhaiyaoStr = "";
String GuanjianciStr = "";
String baseMoneyStr = "";
String classNumStr = "";
String GuanjianCi = "";
String shaiXuanKeyWord = "";
// 数据的爬取 开始
// 获取标题【肯定有】 //*[@id="mainArea"]/div[3]/div[1]/h2 //*[@id="mainArea"]/div[2]/div[1]/h2 //*[@id="mainArea"]
WebElement eleTitleMain = chromdriver.findElement(By.id("mainArea"));
WebElement eleTitleCon = eleTitleMain.findElement(By.className("wxmain"));
WebElement eleTitleTitle = eleTitleCon.findElement(By.className("wxTitle"));
WebElement eleTitleText = eleTitleTitle.findElement(By.tagName("h2"));
// 标题
TitleStr = eleTitleText.getText();
// 作者【可能没有,判空】 //*[@id="mainArea"]/div[3]/div[1]/div[1] //*[@id="mainArea"]/div[3]/div[1]/div[1]/span[1]
WebElement eleAuthor = chromdriver.findElement(By.id("mainArea"));
WebElement eleAuthorCon = eleAuthor.findElement(By.className("wxmain"));
WebElement eleAuthorTitle = eleAuthorCon.findElement(By.className("wxTitle"));
WebElement eleAuthorDiv = eleAuthorTitle.findElement(By.className("author"));
List<WebElement> authors = eleAuthorDiv.findElements(By.tagName("span"));
// 作者
if (authors.size() != 0){
for (int q = 1; q < authors.size(); q++) {
AuthorStr += authors.get(q).getText() + " ";
}
}else {
AuthorStr += "为空";
}
// 单位【可能没有,判空】 //*[@id="mainArea"]/div[3]/div[1]/div[2]
WebElement eleDepart = chromdriver.findElement(By.id("mainArea"));
WebElement eleDepartCon = eleDepart.findElement(By.className("wxmain"));
WebElement eleDepartTitle = eleDepartCon.findElement(By.className("wxTitle"));
WebElement eleDepartDiv = eleDepartTitle.findElement(By.className("orgn"));
List<WebElement> eleDeparts = eleDepartDiv.findElements(By.tagName("span"));
if (eleDeparts.size() != 0){
for (int e = 1; e < eleDeparts.size(); e++) {
DepartStr += eleDeparts.get(e).getText() + " ";
}
}else {
DepartStr += "为空";
}
// 摘要【可能没有,判空】 //*[@id="mainArea"]/div[2]/div[2]/div[1]/p[1] 这里取所有
String mainContent = "";
WebElement eleZhaiyao = chromdriver.findElement(By.id("mainArea"));
WebElement eleZhaiyaoCon = eleZhaiyao.findElement(By.className("wxmain"));
WebElement eleZhaiyaoTitle = eleZhaiyaoCon.findElement(By.className("wxInfo"));
WebElement eleZhaiyaoDiv = eleZhaiyaoTitle.findElement(By.className("wxBaseinfo"));
List<WebElement> spanEle = eleZhaiyaoDiv.findElements(By.tagName("p"));
if (spanEle.size() != 0){
for (int r = 0; r < spanEle.size(); r++) {
if (spanEle.get(r).getText().equals("")) {
} else {
if (spanEle.get(r).getText().contains("手机阅读本文") || spanEle.get(r).getText().contains("下载安装手机APP") ||
spanEle.get(r).getText().contains("扫码同步阅读本文") || spanEle.get(r).getText().contains("文内图片") ||
spanEle.get(r).getText().contains("图1") || spanEle.get(r).getText().contains("图 2") ||
spanEle.get(r).getText().contains("图3") || spanEle.get(r).getText().contains("图4") ||
spanEle.get(r).getText().contains("图5") || spanEle.get(r).getText().contains("图6") ||
spanEle.get(r).getText().contains("图 7") || spanEle.get(r).getText().contains("图 8") ||
spanEle.get(r).getText().contains("图 9") || spanEle.get(r).getText().contains("图10")) {
} else {
shaiXuanKeyWord = spanEle.get(r).getText();
if (shaiXuanKeyWord.contains("关键词")) {
// 存储含有关键字的论文和出版年月
bufferedWriterKey2.write(shaiXuanKeyWord+"\n");
bufferedWriterKey2.write(yearListRe.get(j)+"\n");
bufferedWriterKey2.flush();
GuanjianciStr += shaiXuanKeyWord;
}
mainContent += shaiXuanKeyWord + "\n";
}
}
}
}else {
GuanjianciStr += "为空";
mainContent += "为空";
}
// 截取关键字中的冒号后面的内容
String GuanjianciStrRe = GuanjianciStr.substring(GuanjianciStr.indexOf(":") + 1);
GuanjianCi = GuanjianciStrRe.replace(" ", "");
// 抓取论文的下载链接
String paperHref = "";
WebElement downEle = chromdriver.findElement(By.id("DownLoadParts"));
List<WebElement> downList = downEle.findElements(By.tagName("a"));
for (int k=0;k<downList.size();k++){
//System.out.println(downList.get(k).getText());
String string = downList.get(k).getText();
if (string.equals("CAJ下载")){
paperHref = downList.get(k).getAttribute("href");
}else if (string.equals("整本下载")){
paperHref = downList.get(k).getAttribute("href");
}else if (string.length() == 0){
paperHref = "为空";
}
}
//写进文本文件中
// chromdriver.navigate().refresh();
bufferedWriter.write("标题:" + TitleStr + "\n作者:" + AuthorStr + "\n单位:" + DepartStr + "\n" + mainContent +"出版年月:"+ yearListRe.get(j)+"\n" + "下载链接:"+paperHref+ "\n\n");
bufferedWriter.flush();
// 将关键词写进文本文件中
bufferedWriterKey.write(GuanjianCi);
bufferedWriterKey.flush();
// 数据的爬取 结束/
//driver.navigate().back();
//Thread.sleep(1000);
// 200的倍数
}else if ( j % 200 == 0){
Thread.sleep(60000);
chromdriver.get(reUrlList.get(j));
// chromdriver.manage().timeouts().implicitlyWait(10, TimeUnit.SECONDS);
// System.out.println(chromdriver.getWindowHandle());
//Thread.sleep(500);
//chromdriver.navigate().refresh();
System.out.println(CountNumber++);
String TitleStr = "";
String AuthorStr = "";
String DepartStr = "";
String ZhaiyaoStr = "";
String GuanjianciStr = "";
String baseMoneyStr = "";
String classNumStr = "";
String GuanjianCi = "";
String shaiXuanKeyWord = "";
// 数据的爬取 开始
// 获取标题【肯定有】 //*[@id="mainArea"]/div[3]/div[1]/h2 //*[@id="mainArea"]/div[2]/div[1]/h2 //*[@id="mainArea"]
WebElement eleTitleMain = chromdriver.findElement(By.id("mainArea"));
WebElement eleTitleCon = eleTitleMain.findElement(By.className("wxmain"));
WebElement eleTitleTitle = eleTitleCon.findElement(By.className("wxTitle"));
WebElement eleTitleText = eleTitleTitle.findElement(By.tagName("h2"));
// 标题
TitleStr = eleTitleText.getText();
// 作者【可能没有,判空】 //*[@id="mainArea"]/div[3]/div[1]/div[1] //*[@id="mainArea"]/div[3]/div[1]/div[1]/span[1]
WebElement eleAuthor = chromdriver.findElement(By.id("mainArea"));
WebElement eleAuthorCon = eleAuthor.findElement(By.className("wxmain"));
WebElement eleAuthorTitle = eleAuthorCon.findElement(By.className("wxTitle"));
WebElement eleAuthorDiv = eleAuthorTitle.findElement(By.className("author"));
List<WebElement> authors = eleAuthorDiv.findElements(By.tagName("span"));
// 作者
if (authors.size() != 0){
for (int q = 1; q < authors.size(); q++) {
AuthorStr += authors.get(q).getText() + " ";
}
}else {
AuthorStr += "为空";
}
// 单位【可能没有,判空】 //*[@id="mainArea"]/div[3]/div[1]/div[2]
WebElement eleDepart = chromdriver.findElement(By.id("mainArea"));
WebElement eleDepartCon = eleDepart.findElement(By.className("wxmain"));
WebElement eleDepartTitle = eleDepartCon.findElement(By.className("wxTitle"));
WebElement eleDepartDiv = eleDepartTitle.findElement(By.className("orgn"));
List<WebElement> eleDeparts = eleDepartDiv.findElements(By.tagName("span"));
if (eleDeparts.size() != 0){
for (int e = 1; e < eleDeparts.size(); e++) {
DepartStr += eleDeparts.get(e).getText() + " ";
}
}else {
DepartStr += "为空";
}
// 摘要【可能没有,判空】 //*[@id="mainArea"]/div[2]/div[2]/div[1]/p[1] 这里取所有
String mainContent = "";
WebElement eleZhaiyao = chromdriver.findElement(By.id("mainArea"));
WebElement eleZhaiyaoCon = eleZhaiyao.findElement(By.className("wxmain"));
WebElement eleZhaiyaoTitle = eleZhaiyaoCon.findElement(By.className("wxInfo"));
WebElement eleZhaiyaoDiv = eleZhaiyaoTitle.findElement(By.className("wxBaseinfo"));
List<WebElement> spanEle = eleZhaiyaoDiv.findElements(By.tagName("p"));
if (spanEle.size() != 0){
for (int r = 0; r < spanEle.size(); r++) {
if (spanEle.get(r).getText().equals("")) {
} else {
if (spanEle.get(r).getText().contains("手机阅读本文") || spanEle.get(r).getText().contains("下载安装手机APP") ||
spanEle.get(r).getText().contains("扫码同步阅读本文") || spanEle.get(r).getText().contains("文内图片") ||
spanEle.get(r).getText().contains("图1") || spanEle.get(r).getText().contains("图 2") ||
spanEle.get(r).getText().contains("图3") || spanEle.get(r).getText().contains("图4") ||
spanEle.get(r).getText().contains("图5") || spanEle.get(r).getText().contains("图6") ||
spanEle.get(r).getText().contains("图 7") || spanEle.get(r).getText().contains("图 8") ||
spanEle.get(r).getText().contains("图 9") || spanEle.get(r).getText().contains("图10")) {
} else {
shaiXuanKeyWord = spanEle.get(r).getText();
if (shaiXuanKeyWord.contains("关键词")) {
// 存储含有关键字的论文和出版年月
bufferedWriterKey2.write(shaiXuanKeyWord+"\n");
bufferedWriterKey2.write(yearListRe.get(j)+"\n");
bufferedWriterKey2.flush();
GuanjianciStr += shaiXuanKeyWord;
}
mainContent += shaiXuanKeyWord + "\n";
}
}
}
}else {
GuanjianciStr += "为空";
mainContent += "为空";
}
// 截取关键字中的冒号后面的内容
String GuanjianciStrRe = GuanjianciStr.substring(GuanjianciStr.indexOf(":") + 1);
GuanjianCi = GuanjianciStrRe.replace(" ", "");
// 抓取论文的下载链接
String paperHref = "";
WebElement downEle = chromdriver.findElement(By.id("DownLoadParts"));
List<WebElement> downList = downEle.findElements(By.tagName("a"));
for (int k=0;k<downList.size();k++){
//System.out.println(downList.get(k).getText());
String string = downList.get(k).getText();
if (string.equals("CAJ下载")){
paperHref = downList.get(k).getAttribute("href");
}else if (string.equals("整本下载")){
paperHref = downList.get(k).getAttribute("href");
}else if (string.length() == 0){
paperHref = "为空";
}
}
//写进文本文件中
// chromdriver.navigate().refresh();
bufferedWriter.write("标题:" + TitleStr + "\n作者:" + AuthorStr + "\n单位:" + DepartStr + "\n" + mainContent +"出版年月:"+ yearListRe.get(j)+"\n" + "下载链接:"+paperHref+ "\n\n");
bufferedWriter.flush();
// 将关键词写进文本文件中
bufferedWriterKey.write(GuanjianCi);
bufferedWriterKey.flush();
// 数据的爬取 结束/
//driver.navigate().back();
//Thread.sleep(1000);
}else{
Thread.sleep(1000);
chromdriver.get(reUrlList.get(j));
// chromdriver.manage().timeouts().implicitlyWait(10, TimeUnit.SECONDS);
// System.out.println(chromdriver.getWindowHandle());
//Thread.sleep(500);
//chromdriver.navigate().refresh();
System.out.println(CountNumber++);
String TitleStr = "";
String AuthorStr = "";
String DepartStr = "";
String ZhaiyaoStr = "";
String GuanjianciStr = "";
String baseMoneyStr = "";
String classNumStr = "";
String GuanjianCi = "";
String shaiXuanKeyWord = "";
// 数据的爬取 开始
// 获取标题【肯定有】 //*[@id="mainArea"]/div[3]/div[1]/h2 //*[@id="mainArea"]/div[2]/div[1]/h2 //*[@id="mainArea"]
WebElement eleTitleMain = chromdriver.findElement(By.id("mainArea"));
WebElement eleTitleCon = eleTitleMain.findElement(By.className("wxmain"));
WebElement eleTitleTitle = eleTitleCon.findElement(By.className("wxTitle"));
WebElement eleTitleText = eleTitleTitle.findElement(By.tagName("h2"));
// 标题
TitleStr = eleTitleText.getText();
// 作者【可能没有,判空】 //*[@id="mainArea"]/div[3]/div[1]/div[1] //*[@id="mainArea"]/div[3]/div[1]/div[1]/span[1]
WebElement eleAuthor = chromdriver.findElement(By.id("mainArea"));
WebElement eleAuthorCon = eleAuthor.findElement(By.className("wxmain"));
WebElement eleAuthorTitle = eleAuthorCon.findElement(By.className("wxTitle"));
WebElement eleAuthorDiv = eleAuthorTitle.findElement(By.className("author"));
List<WebElement> authors = eleAuthorDiv.findElements(By.tagName("span"));
// 作者
if (authors.size() != 0){
for (int q = 1; q < authors.size(); q++) {
AuthorStr += authors.get(q).getText() + " ";
}
}else {
AuthorStr += "为空";
}
// 单位【可能没有,判空】 //*[@id="mainArea"]/div[3]/div[1]/div[2]
WebElement eleDepart = chromdriver.findElement(By.id("mainArea"));
WebElement eleDepartCon = eleDepart.findElement(By.className("wxmain"));
WebElement eleDepartTitle = eleDepartCon.findElement(By.className("wxTitle"));
WebElement eleDepartDiv = eleDepartTitle.findElement(By.className("orgn"));
List<WebElement> eleDeparts = eleDepartDiv.findElements(By.tagName("span"));
if (eleDeparts.size() != 0){
for (int e = 1; e < eleDeparts.size(); e++) {
DepartStr += eleDeparts.get(e).getText() + " ";
}
}else {
DepartStr += "为空";
}
// 摘要【可能没有,判空】 //*[@id="mainArea"]/div[2]/div[2]/div[1]/p[1] 这里取所有
String mainContent = "";
WebElement eleZhaiyao = chromdriver.findElement(By.id("mainArea"));
WebElement eleZhaiyaoCon = eleZhaiyao.findElement(By.className("wxmain"));
WebElement eleZhaiyaoTitle = eleZhaiyaoCon.findElement(By.className("wxInfo"));
WebElement eleZhaiyaoDiv = eleZhaiyaoTitle.findElement(By.className("wxBaseinfo"));
List<WebElement> spanEle = eleZhaiyaoDiv.findElements(By.tagName("p"));
if (spanEle.size() != 0){
for (int r = 0; r < spanEle.size(); r++) {
if (spanEle.get(r).getText().equals("")) {
} else {
if (spanEle.get(r).getText().contains("手机阅读本文") || spanEle.get(r).getText().contains("下载安装手机APP") ||
spanEle.get(r).getText().contains("扫码同步阅读本文") || spanEle.get(r).getText().contains("文内图片") ||
spanEle.get(r).getText().contains("图1") || spanEle.get(r).getText().contains("图 2") ||
spanEle.get(r).getText().contains("图3") || spanEle.get(r).getText().contains("图4") ||
spanEle.get(r).getText().contains("图5") || spanEle.get(r).getText().contains("图6") ||
spanEle.get(r).getText().contains("图 7") || spanEle.get(r).getText().contains("图 8") ||
spanEle.get(r).getText().contains("图 9") || spanEle.get(r).getText().contains("图10")) {
} else {
shaiXuanKeyWord = spanEle.get(r).getText();
if (shaiXuanKeyWord.contains("关键词")) {
// 存储含有关键字的论文和出版年月
bufferedWriterKey2.write(shaiXuanKeyWord+"\n");
bufferedWriterKey2.write(yearListRe.get(j)+"\n");
bufferedWriterKey2.flush();
GuanjianciStr += shaiXuanKeyWord;
}
mainContent += shaiXuanKeyWord + "\n";
}
}
}
}else {
GuanjianciStr += "为空";
mainContent += "为空";
}
// 截取关键字中的冒号后面的内容
String GuanjianciStrRe = GuanjianciStr.substring(GuanjianciStr.indexOf(":") + 1);
GuanjianCi = GuanjianciStrRe.replace(" ", "");
// 抓取论文的下载链接
String paperHref = "";
WebElement downEle = chromdriver.findElement(By.id("DownLoadParts"));
List<WebElement> downList = downEle.findElements(By.tagName("a"));
for (int k=0;k<downList.size();k++){
//System.out.println(downList.get(k).getText());
String string = downList.get(k).getText();
if (string.equals("CAJ下载")){
paperHref = downList.get(k).getAttribute("href");
}else if (string.equals("整本下载")){
paperHref = downList.get(k).getAttribute("href");
}else if (string.length() == 0){
paperHref = "为空";
}
}
//写进文本文件中
// chromdriver.navigate().refresh();
bufferedWriter.write("标题:" + TitleStr + "\n作者:" + AuthorStr + "\n单位:" + DepartStr + "\n" + mainContent +"出版年月:"+ yearListRe.get(j)+"\n" + "下载链接:"+paperHref+ "\n\n");
bufferedWriter.flush();
// 将关键词写进文本文件中
bufferedWriterKey.write(GuanjianCi);
bufferedWriterKey.flush();
// 数据的爬取 结束/
//driver.navigate().back();
//Thread.sleep(1000);
}
}else {
Thread.sleep(1000);
firedriver.get(reUrlList.get(j));
// firedriver.manage().timeouts().implicitlyWait(10, TimeUnit.SECONDS);
// System.out.println(chromdriver.getWindowHandle());
// Thread.sleep(500);
// firedriver.navigate().refresh();
System.out.println(CountNumber++);
String TitleStr = "";
String AuthorStr = "";
String DepartStr = "";
String ZhaiyaoStr = "";
String GuanjianciStr = "";
String baseMoneyStr = "";
String classNumStr = "";
String GuanjianCi = "";
String shaiXuanKeyWord = "";
// 数据的爬取 开始
// 获取标题【肯定有】 //*[@id="mainArea"]/div[3]/div[1]/h2 //*[@id="mainArea"]/div[2]/div[1]/h2 //*[@id="mainArea"]
WebElement eleTitleMain = firedriver.findElement(By.id("mainArea"));
WebElement eleTitleCon = eleTitleMain.findElement(By.className("wxmain"));
WebElement eleTitleTitle = eleTitleMain.findElement(By.className("wxTitle"));
WebElement eleTitleText = eleTitleTitle.findElement(By.tagName("h2"));
// 标题
TitleStr = eleTitleText.getText();
// 作者【可能没有,判空】 //*[@id="mainArea"]/div[3]/div[1]/div[1] //*[@id="mainArea"]/div[3]/div[1]/div[1]/span[1]
WebElement eleAuthor = firedriver.findElement(By.id("mainArea"));
WebElement eleAuthorCon = eleAuthor.findElement(By.className("wxmain"));
WebElement eleAuthorTitle = eleAuthorCon.findElement(By.className("wxTitle"));
WebElement eleAuthorDiv = eleAuthorTitle.findElement(By.className("author"));
List<WebElement> authors = eleAuthorDiv.findElements(By.tagName("span"));
// 作者
if (authors.size() != 0){
for (int q = 1; q < authors.size(); q++) {
AuthorStr += authors.get(q).getText() + " ";
}
}else {
AuthorStr += "为空";
}
// 单位【可能没有,判空】 //*[@id="mainArea"]/div[3]/div[1]/div[2]
// //*[@id="mainArea"]/div[3]/div[1]/div[2] //*[@id="mainArea"]/div[3]/div[1]/div[2]
// WebElement eleDepart = driver.findElement(By.xpath("//*[@id=\"mainArea\"]"));
WebElement eleDepart = firedriver.findElement(By.id("mainArea"));
WebElement eleDepartCon = eleDepart.findElement(By.className("wxmain"));
WebElement eleDepartTitle = eleDepartCon.findElement(By.className("wxTitle"));
WebElement eleDepartDiv = eleDepartTitle.findElement(By.className("orgn"));
List<WebElement> eleDeparts = eleDepartDiv.findElements(By.tagName("span"));
if (eleDeparts.size() != 0){
for (int e = 1; e < eleDeparts.size(); e++) {
DepartStr += eleDeparts.get(e).getText() + " ";
}
}else {
DepartStr += "为空";
}
// 摘要【可能没有,判空】 //*[@id="mainArea"]/div[2]/div[2]/div[1]/p[1] 这里取所有
// 用一个list来装这些信息
//List<String> mainContent = new ArrayList<>();
String mainContent = "";
// WebElement eleZhaiyao = driver.findElement(By.xpath("//*[@id=\"mainArea\"]"));
WebElement eleZhaiyao = firedriver.findElement(By.id("mainArea"));
WebElement eleZhaiyaoCon = eleZhaiyao.findElement(By.className("wxmain"));
WebElement eleZhaiyaoTitle = eleZhaiyaoCon.findElement(By.className("wxInfo"));
WebElement eleZhaiyaoDiv = eleZhaiyaoTitle.findElement(By.className("wxBaseinfo"));
List<WebElement> spanEle = eleZhaiyaoDiv.findElements(By.tagName("p"));
if (spanEle.size() != 0){
//String str = "[0-9]{1}";
for (int r = 0; r < spanEle.size(); r++) {
//System.out.println(spanEle.get(r).getText());
if (spanEle.get(r).getText().equals("")) {
} else {
if (spanEle.get(r).getText().contains("手机阅读本文") || spanEle.get(r).getText().contains("下载安装手机APP") ||
spanEle.get(r).getText().contains("扫码同步阅读本文") || spanEle.get(r).getText().contains("文内图片") ||
spanEle.get(r).getText().contains("图1") || spanEle.get(r).getText().contains("图 2") ||
spanEle.get(r).getText().contains("图3") || spanEle.get(r).getText().contains("图4") ||
spanEle.get(r).getText().contains("图5") || spanEle.get(r).getText().contains("图6") ||
spanEle.get(r).getText().contains("图 7") || spanEle.get(r).getText().contains("图 8") ||
spanEle.get(r).getText().contains("图 9") || spanEle.get(r).getText().contains("图10")) {
} else {
shaiXuanKeyWord = spanEle.get(r).getText();
if (shaiXuanKeyWord.contains("关键词")) {
// 存储含有关键字的论文和出版年月
bufferedWriterKey2.write(shaiXuanKeyWord+"\n");
bufferedWriterKey2.write(yearListRe.get(j)+"\n");
bufferedWriterKey2.flush();
GuanjianciStr += shaiXuanKeyWord;
}
mainContent += shaiXuanKeyWord + "\n";
}
}
}
}else {
GuanjianciStr += "为空";
mainContent += "为空";
}
// 截取关键字中的冒号后面的内容
String GuanjianciStrRe = GuanjianciStr.substring(GuanjianciStr.indexOf(":") + 1);
GuanjianCi = GuanjianciStrRe.replace(" ", "");
// 抓取论文的下载链接
String paperHref = "";
WebElement downEle = firedriver.findElement(By.id("DownLoadParts"));
List<WebElement> downList = downEle.findElements(By.tagName("a"));
for (int k=0;k<downList.size();k++){
//System.out.println(downList.get(k).getText());
String string = downList.get(k).getText();
if (string.equals("CAJ下载")){
paperHref = downList.get(k).getAttribute("href");
}else if (string.equals("整本下载")){
paperHref = downList.get(k).getAttribute("href");
}else if (string.length() == 0){
paperHref = "为空";
}
}
//写进文本文件中
// bufferedWriter.write("标题:" + TitleStr + "\n作者:" + AuthorStr + "\n单位:" + DepartStr + "\n" + mainContent + "\n\n");
bufferedWriter.write("标题:" + TitleStr + "\n作者:" + AuthorStr + "\n单位:" + DepartStr + "\n" + mainContent+"出版年月:"+ yearListRe.get(j)+"\n"+"下载链接:"+paperHref + "\n\n");
bufferedWriter.flush();
// 将关键词写进文本文件中
bufferedWriterKey.write(GuanjianCi);
bufferedWriterKey.flush();
// 数据的爬取 结束/
//driver.navigate().back();
//Thread.sleep(1000);
}
}
in.close();
bufferedWriter.close();
bufferedWriterKey.close();
fileWriterKey.close();
fileWriter.close();
bufferedWriterKey2.close();
fileKeyTest2.getClass();
}
/**
 * Returns the most frequent keywords stored in {@code textKey.txt}, with their counts.
 *
 * <p>The file is looked up in {@code src/main/webapp/data} below the classpath root
 * (or below the working directory when the classpath root does not exist). Its lines are
 * concatenated without a separator, the text is split on {@code ';'}, and every token is
 * counted. The entries with the highest counts are returned, at most five of them; when
 * the file contains fewer than five distinct keywords, all of them are returned.
 *
 * @return a map with the single key {@code "resultMap"}; its value maps each selected
 *         keyword to its count and iterates from the most to the least frequent keyword
 * @throws IOException if the keyword file cannot be opened or read
 */
@RequestMapping("/getKeyWord")
@ResponseBody
public Map<String, Map<String, Integer>> getKeyWord() throws IOException {
    File path1 = new File(ResourceUtils.getURL("classpath:").getPath());
    if (!path1.exists()) {
        path1 = new File("");
    }
    File upload = new File(path1.getAbsolutePath(), "src/main/webapp/data");
    if (!upload.exists()) {
        upload.mkdirs();
    }
    // The path string is built exactly as before so that it keeps matching whatever code
    // writes the file. NOTE(review): the "\\" separator is Windows-specific.
    String pathKey = upload.getAbsolutePath() + "\\textKey.txt";

    // Read the whole file; try-with-resources guarantees the reader is closed even when
    // reading fails.
    StringBuilder text = new StringBuilder();
    try (BufferedReader reader = new BufferedReader(new FileReader(new File(pathKey)))) {
        String line;
        while ((line = reader.readLine()) != null) {
            text.append(line);
        }
    }

    // Count how often each ';'-separated token occurs (empty tokens are skipped by
    // StringTokenizer).
    Map<String, Integer> counts = new HashMap<String, Integer>();
    StringTokenizer tokens = new StringTokenizer(text.toString(), ";");
    while (tokens.hasMoreTokens()) {
        String word = tokens.nextToken();
        Integer seen = counts.get(word);
        counts.put(word, seen == null ? 1 : seen + 1);
    }

    // Rank the entries by descending count. The sort is stable, so keywords with equal
    // counts keep the relative order in which the counting map iterates them.
    List<Map.Entry<String, Integer>> ranked =
            new ArrayList<Map.Entry<String, Integer>>(counts.entrySet());
    Collections.sort(ranked, new Comparator<Map.Entry<String, Integer>>() {
        @Override
        public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
            return o2.getValue().compareTo(o1.getValue());
        }
    });

    // Take at most five entries; clamping avoids an IndexOutOfBoundsException when fewer
    // than five distinct keywords exist. A LinkedHashMap keeps the ranking order.
    int limit = Math.min(5, ranked.size());
    Map<String, Integer> resultMap = new LinkedHashMap<String, Integer>();
    for (Map.Entry<String, Integer> entry : ranked.subList(0, limit)) {
        resultMap.put(entry.getKey(), entry.getValue());
    }

    Map<String, Map<String, Integer>> map = new HashMap<String, Map<String, Integer>>();
    map.put("resultMap", resultMap);
    return map;
}
/**
 * Starts a Firefox browser through Selenium and loads a fixed page.
 *
 * <p>The driver created here is not quit inside this method.
 */
@RequestMapping("/fire")
@ResponseBody
public void fire(){
    // Install location of the Firefox executable.
    final String firefoxBinaryKey = "webdriver.firefox.bin";
    final String firefoxBinaryPath = "D:\\fireBrower\\firefox.exe";
    System.setProperty(firefoxBinaryKey, firefoxBinaryPath);

    // Location of the geckodriver executable.
    // NOTE(review): current Selenium releases read the driver path from
    // "webdriver.gecko.driver" - confirm this key against the Selenium version in use.
    final String driverKey = "webdriver.firefox.marionette";
    final String driverPath = "D:\\fireBrower\\geckodriver.exe";
    System.setProperty(driverKey, driverPath);

    final WebDriver browser = new FirefoxDriver();
    browser.get("http://sms.webchinese.cn/api.shtml");
}
}