// Browser session used by the scraping loop further down.
WebDriver driver = SeleniumUtil.getNormalDriver();

// Candidate words harvested from the files already present under c:/nlp/深言.
// Per row:
//   - a row containing ">>>" loses its first three characters and anything from the first "("
//     onwards; the remainder is kept when it is 2 to 4 characters long;
//   - a row containing no ">" at all is split on single spaces and every token of
//     2 to 4 characters is kept;
//   - every other row (it contains ">" but not ">>>") is ignored.
Set<String> set1 = new HashSet<String>();
for (String path : FileUtil.getChildrenPaths("c:/nlp/深言")) {
    for (String row : TxtUtil.getRows(path)) {
        if (row.contains(">>>")) {
            String candidate = row.substring(3);
            int parenAt = candidate.indexOf("(");
            if (parenAt >= 0) {
                candidate = candidate.substring(0, parenAt);
            }
            if (candidate.length() >= 2 && candidate.length() <= 4) {
                set1.add(candidate);
            }
            continue;
        }
        if (row.contains(">")) {
            continue;
        }
        for (String token : row.split(" ")) {
            int tokenLength = token.length();
            if (tokenLength >= 2 && tokenLength <= 4) {
                set1.add(token);
            }
        }
    }
}
// Disabled: dump of the collected words, ordered, one per line.
// FileUtil.overwriteTxt(ListUtil.list2Str(ListUtil.orderChinese(SetUtil.set2List(set1)),"\n"), "c:/nlp/深言.txt");
// Scrapes every candidate word that has no result file yet and stores its definitions,
// synonym/antonym-style groups, related words (相关词) and associated words (联想词) in
// c:/nlp/深言/<word>.txt.
//
// Output layout, as produced by the appends below:
//   ">字义", then one ">>explanation" line per card, each followed by its ">>>example" lines
//   ">" + group title, then one ">>word" line per entry, for every synony-content block
//   ">相关词", then a single space-separated line of words
//   ">联想词", then a single space-separated line of words (no trailing newline)
//
// Any failure while processing a word is printed together with the word and no file is
// written for it, so the existence check at the top of the loop lets a later run retry it.
//
// NOTE(review): the driver is not quit anywhere in this snippet - confirm who owns its lifetime.

// Upper bound on the number of 100 ms polls spent waiting for a pager click to replace the
// word list (about 10 seconds); prevents an endless loop when the list never changes.
final int maxPagePolls = 100;
for (String zi : set1) {
    if (FileUtil.isFileExists("c:/nlp/深言/" + zi + ".txt")) {
        continue;
    }
    // Only query words made up entirely of characters in the CJK range of the pattern below.
    if (!zi.matches("[\u4e00-\u9fa5]+")) {
        continue;
    }
    try {
        driver.get("https://www.shenyandayi.com/wantWordsResult?lang=zh&query="
                + EncodeUtils.urlEncode(zi, "utf-8") + "&category=1001");
        SeleniumUtil.waitAppear(driver, 10, By.id("word-item-wrapper-相关词-01"));
        // Presumably expands collapsed content so it is part of the page source - TODO confirm.
        if (SeleniumUtil.exists(driver, By.className("expand-text"))) {
            SeleniumUtil.click(driver, driver.findElement(By.className("expand-text")));
        }
        String page = driver.getPageSource();
        StringBuilder sb = new StringBuilder();

        // Section 1: explanations and their examples.
        if (SeleniumUtil.exists(driver, By.className("basic-info"))) {
            String basicInfo = driver.findElement(By.className("basic-info")).getAttribute("innerHTML");
            sb.append(">字义\n");
            for (String card : HtmlUtil.find(basicInfo, "div[class=card-item]")) {
                String explanation = HtmlUtil.innerText(HtmlUtil.find(card, "div[class=explanation]").get(0));
                sb.append(">>" + explanation + "\n");
                for (String example : HtmlUtil.find(card, "div[class=example]")) {
                    String exampleText = HtmlUtil.innerText(example);
                    sb.append(">>>" + StringUtil.killEndToken(exampleText.replace(" ", ""), "。") + "\n");
                }
            }
        }

        // Section 2: one titled group per synony-content block.
        for (String group : HtmlUtil.find(page, "div[class=synony-content]")) {
            String title = HtmlUtil.innerText(HtmlUtil.find(group, "span[class=synony-antony-title]").get(0));
            sb.append(">" + title + "\n");
            for (String wordItem : HtmlUtil.find(group, "span[class=word-item]")) {
                sb.append(">>" + HtmlUtil.innerText(wordItem) + "\n");
            }
        }

        // Sections 3 and 4: the two paginated cards. They share the same scraping logic and
        // differ only in the card name and in whether the section ends with a newline.
        String wordSelector = "span[class=word-item]>span[class=item-value-content][value]";
        String[] pagedCards = {"相关词", "联想词"};
        String[] sectionEndings = {"\n", ""};
        for (int c = 0; c < pagedCards.length; c++) {
            String cardName = pagedCards[c];
            String cardId = "wantwordResultCard-" + cardName;
            if (!SeleniumUtil.exists(driver, By.id(cardId))) {
                continue;
            }
            String cardHtml = driver.findElement(By.id(cardId)).getAttribute("innerHTML");
            // Index of the last pager <li> to click. Presumably li 1 is the "previous" arrow,
            // li 2 is page 1 (already displayed) and the last li is the "next" arrow - TODO
            // confirm against the live DOM.
            int lastPagerItem = HtmlUtil.find(cardHtml, "ul[class=ant-pagination ant-pagination-mini]>li")
                    .size() - 1;
            By contentLocator = By.cssSelector("#" + cardId + " div.wantword-card-content");

            ArrayList<String> allWords = new ArrayList<>();
            List<String> previousPage = HtmlUtil.find(
                    driver.findElement(contentLocator).getAttribute("innerHTML"), wordSelector);
            allWords.addAll(previousPage);

            for (int i = 3; i <= lastPagerItem; i++) {
                SeleniumUtil.click(driver, driver.findElement(
                        By.cssSelector("#" + cardId + " div.page-nation ul li:nth-child(" + i + ")")));
                // Poll until the list differs from the page shown BEFORE the click. Comparing
                // against the first page only (as the code used to) accepts a not-yet-refreshed
                // list from the third page on, storing the previous page twice and losing the
                // requested one.
                List<String> currentPage;
                int polls = 0;
                do {
                    Thread.sleep(100);
                    currentPage = HtmlUtil.find(
                            driver.findElement(contentLocator).getAttribute("innerHTML"), wordSelector);
                    polls++;
                } while (currentPage.equals(previousPage) && polls < maxPagePolls);
                if (currentPage.equals(previousPage)) {
                    // Give up on this word; no file is written, so it is retried on the next run.
                    throw new IllegalStateException("Word list of card " + cardName
                            + " did not change after clicking pager item " + i + " for " + zi);
                }
                allWords.addAll(currentPage);
                previousPage = currentPage;
            }
            sb.append(">" + cardName + "\n" + ListUtil.list2Str(allWords, " ") + sectionEndings[c]);
        }

        FileUtil.writeTxt(sb.toString(), "c:/nlp/深言/" + zi + ".txt");
    } catch (InterruptedException e) {
        // Restore the interrupt flag and stop scraping instead of silently continuing.
        Thread.currentThread().interrupt();
        System.out.println(zi);
        break;
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println(zi);
    }
}
获取深言词汇
最新推荐文章于 2024-01-07 21:00:51 发布
文章描述了如何使用 Selenium WebDriver 爬取网页，获取深言网中特定词语的字义、相关词和联想词，并将结果保存到文本文件中。
摘要由CSDN通过智能技术生成