获取深言词汇

文章描述了如何使用 Selenium WebDriver 进行网页爬取,获取深言网中特定词语的字义、相关词和联想词,并将结果保存到文本文件中。
摘要由CSDN通过智能技术生成
// Browser session used by the crawl loop below.
WebDriver driver = SeleniumUtil.getNormalDriver();
			
			// Seed words for the crawl, harvested from the result files already
			// stored under c:/nlp/深言. Only tokens of 2 to 4 characters are kept.
			Set<String> set1 = new HashSet<>();
			for (String p : FileUtil.getChildrenPaths("c:/nlp/深言")) {
				for (String r : TxtUtil.getRows(p)) {
					// Rows prefixed with ">>>" are example rows. The marker must be a
					// prefix (not merely contained) because the next step strips
					// exactly the first three characters.
					if (r.startsWith(">>>")) {
						r = r.substring(3);
						// Drop a parenthesised suffix, keeping only the text before "(".
						if (r.contains("(")) {
							r = r.substring(0, r.indexOf("("));
						}
						if (r.length() > 1 && r.length() < 5) {
							set1.add(r);
						}
					} else if (!r.contains(">")) {
						// Rows without any ">" marker are space-separated word lists.
						for (String s : r.split(" ")) {
							if (s.length() > 1 && s.length() < 5) {
								set1.add(s);
							}
						}
					}
					// Every other row (headers and ">>" rows) contributes no seed words.
				}
			}
//			FileUtil.overwriteTxt(ListUtil.list2Str(ListUtil.orderChinese(SetUtil.set2List(set1)),"\n"), "c:/nlp/深言.txt");
			// Crawl every seed word that has no result file yet. For each word the
			// result page is loaded and a text file is written with these sections:
			//   >字义        one ">>" row per explanation, ">>>" rows for its examples
			//   >(title)     one ">>" row per word of each synony-content group
			//   >相关词      all paginated words, space separated
			//   >联想词      all paginated words, space separated
			for (String zi : set1) {
				// Already crawled in an earlier run.
				if (FileUtil.isFileExists("c:/nlp/深言/" + zi + ".txt")) {
					continue;
				}
				// Only query words made up entirely of CJK characters in this range.
				if (!zi.matches("[\u4e00-\u9fa5]+")) {
					continue;
				}

				try {
					driver.get("https://www.shenyandayi.com/wantWordsResult?lang=zh&query="
							+ EncodeUtils.urlEncode(zi, "utf-8") + "&category=1001");
					SeleniumUtil.waitAppear(driver, 10, By.id("word-item-wrapper-相关词-01"));
					// Click the "expand" control when present before reading the page.
					if (SeleniumUtil.exists(driver, By.className("expand-text"))) {
						SeleniumUtil.click(driver, driver.findElement(By.className("expand-text")));
					}
					String page = driver.getPageSource();

					StringBuilder sb = new StringBuilder();

					// --- Definitions and their examples ---
					if (SeleniumUtil.exists(driver, By.className("basic-info"))) {
						String basicinfo = driver.findElement(By.className("basic-info")).getAttribute("innerHTML");
						sb.append(">字义\n");
						for (String card : HtmlUtil.find(basicinfo, "div[class=card-item]")) {
							String explanation = HtmlUtil.innerText(HtmlUtil.find(card, "div[class=explanation]").get(0));
							sb.append(">>" + explanation + "\n");
							for (String example : HtmlUtil.find(card, "div[class=example]")) {
								example = HtmlUtil.innerText(example);
								sb.append(">>>" + StringUtil.killEndToken(example.replace(" ", ""), "。") + "\n");
							}
						}
					}

					// --- Titled word groups found in the page source ---
					for (String synony_content : HtmlUtil.find(page, "div[class=synony-content]")) {
						String type = HtmlUtil
								.innerText(HtmlUtil.find(synony_content, "span[class=synony-antony-title]").get(0));
						sb.append(">" + type + "\n");
						for (String word_item : HtmlUtil.find(synony_content, "span[class=word-item]")) {
							sb.append(">>" + HtmlUtil.innerText(word_item) + "\n");
						}
					}

					// --- Paginated cards: 相关词 first, then 联想词 ---
					final String itemQuery = "span[class=word-item]>span[class=item-value-content][value]";
					final int maxPolls = 100; // 100 polls x 100 ms sleep per page change
					for (String section : new String[] { "相关词", "联想词" }) {
						String cardId = "wantwordResultCard-" + section;
						if (!SeleniumUtil.exists(driver, By.id(cardId))) {
							continue;
						}
						By contentBy = By.cssSelector("#" + cardId + " div.wantword-card-content");

						// The loop below clicks pager children 3..(count - 1): child 2 is
						// taken to be the page already shown and the first/last children
						// to be the prev/next arrows. NOTE(review): confirm against the
						// live pager markup.
						String cardHtml = driver.findElement(By.id(cardId)).getAttribute("innerHTML");
						int pageNum = HtmlUtil.find(cardHtml, "ul[class=ant-pagination ant-pagination-mini]>li")
								.size() - 1;

						ArrayList<String> words = new ArrayList<>();
						List<String> previous = HtmlUtil.find(
								driver.findElement(contentBy).getAttribute("innerHTML"), itemQuery);
						words.addAll(previous);

						for (int i = 3; i <= pageNum; i++) {
							SeleniumUtil.click(driver, driver.findElement(
									By.cssSelector("#" + cardId + " div.page-nation ul li:nth-child(" + i + ")")));

							// Poll until the card shows something different from the page
							// captured just before the click. Comparing against the first
							// page only would accept stale content from page 3 onwards.
							List<String> current = previous;
							for (int poll = 0; poll < maxPolls && current.equals(previous); poll++) {
								Thread.sleep(100);
								current = HtmlUtil.find(
										driver.findElement(contentBy).getAttribute("innerHTML"), itemQuery);
							}
							if (current.equals(previous)) {
								// Abort this word without writing a file so that it is
								// retried on a later run instead of being stored incomplete.
								throw new IllegalStateException(
										section + " page " + (i - 1) + " did not load for " + zi);
							}
							words.addAll(current);
							previous = current;
						}

						sb.append(">" + section + "\n" + ListUtil.list2Str(words, " "));
						// Only the 相关词 section is newline-terminated; 联想词 ends the file.
						if ("相关词".equals(section)) {
							sb.append("\n");
						}
					}

					FileUtil.writeTxt(sb.toString(), "c:/nlp/深言/" + zi + ".txt");
				} catch (InterruptedException e) {
					// Restore the interrupt flag and stop crawling instead of swallowing it.
					Thread.currentThread().interrupt();
					e.printStackTrace();
					System.out.println(zi);
					break;
				} catch (Exception e) {
					// Best effort: report the failing word and move on to the next one.
					e.printStackTrace();
					System.out.println(zi);
				}
			}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

ak01_10

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值