爬英语单词

/**
 * Crawls the Merriam-Webster word-finder listings and stores one summary file per word.
 *
 * <p>For every letter a-z, listing pages 1..9 are fetched. Link texts whose
 * {@code href} is exactly {@code /dictionary/<text>} are collected as words; the
 * first page of a letter that yields no such word ends that letter. Each word's
 * dictionary page is then fetched on a worker pool and handed to
 * {@code makeFile}, unless its summary file already exists.
 *
 * @param args unused
 * @throws IOException declared by the original signature
 */
public static void main(String[] args) throws IOException {
		Connection conn = DbUtil.getBestConn();
		try {
			// NOTE(review): db is not used by the crawl below; kept because the
			// constructor's side effects (if any) are not visible here.
			DbUtil db = new DbUtil(conn);

			for (String word : "abcdefghijklmnopqrstuvwxyz".split("")) {
				for (int i = 1; i < 10; i++) {
					String page = SpiderUtil.getPageContent("https://www.merriam-webster.com/wordfinder/classic/begins/common/-1/" + word + "/" + i);
					Set<String> cis = new HashSet<>();
					List<String> as = HtmlUtil.find(page, "div[class=panel-body]>li>a");
					for (String a : as) {
						String ci = HtmlUtil.getValue(a);
						// Skip links without usable text and single-character entries.
						if (ci == null || ci.length() < 2) continue;
						String href = HtmlUtil.propertyValue(a, "href");
						// Constant-first equals: a link without an href must not throw.
						if (("/dictionary/" + ci).equals(href)) {
							cis.add(ci);
						}
					}
					// An empty listing page means this letter has no further pages.
					if (cis.isEmpty()) break;

					// One pool of 20 worker threads per listing page.
					ExecutorService executor = Executors.newFixedThreadPool(20);
					for (String ci : cis) {
						executor.submit(() -> {
							System.out.println(ci);
							// Same path makeFile writes to (note the ".txt" extension),
							// so words that were already processed are skipped.
							if (FileUtil.isFileExists("c:/comingSun/英语/merriam词汇/" + ci + ".txt")) return;
							try {
								String file = SpiderUtil.getPageContent("https://www.merriam-webster.com/dictionary/" + ci);
								makeFile(ci, file);
							} catch (RuntimeException e) {
								// submit() stores exceptions in the discarded Future,
								// so report them here or they vanish silently.
								System.err.println("failed to process word: " + ci);
								e.printStackTrace();
							}
						});
					}
					// Presumably shuts the pool down and waits for all tasks — confirm in ThreadUtil.
					ThreadUtil.runUntilTerminated(executor);
				}
			}

//			extracteWord();
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			DbUtil.close(conn);
		}
	}
	/** Serialises access to the shared phrase file, which worker threads append to and rewrite. */
	private static final Object PHRASE_FILE_LOCK = new Object();

	/**
	 * Stores the raw dictionary page of a word and writes a tagged summary extracted from it.
	 *
	 * <p>The raw page goes to {@code merriam词汇_存储/<ci>.txt}. The summary, written to
	 * {@code merriam词汇/<ci>.txt}, is built from the {@code div[id=left-content]} part of
	 * the page and contains, in this order: parts of speech ({@code <词性>}), inflections
	 * ({@code <时态>}), related word forms, sense example sentences, phrases, example
	 * sentences and web examples. Related phrases are additionally appended to the
	 * shared file {@code 词组.txt}.
	 *
	 * <p>If the page has no {@code left-content} block, only the raw page is stored.
	 *
	 * @param ci   the word being processed; used as the file name
	 * @param file the HTML of the word's dictionary page
	 */
	private static void makeFile(String ci, String file) {
		FileUtil.overwriteTxt(file, "c:/comingSun/英语/merriam词汇_存储/"+ci+".txt");

		List<String> leftContent = HtmlUtil.find(file, "div[id=left-content]");
		if (leftContent.isEmpty()) {
			// Nothing to summarise (e.g. an error page); avoid get(0) on an empty list.
			System.err.println("no left-content block for word: " + ci);
			return;
		}
		file = leftContent.get(0);

		StringBuilder sb = new StringBuilder();
		List<String> cixing = HtmlUtil.find(file, "h2[class=parts-of-speech]>a[value]");
		List<String> shitai = HtmlUtil.find(file, "span[class=if][value]");
		sb.append("<词性>"+ListUtil.list2Str(ListUtil.killSame(cixing), ",")+"\r\n");
		sb.append("<时态>"+ListUtil.list2Str(ListUtil.killSame(shitai), ",")+"\r\n");

		appendRelatedForms(sb, file);
		appendSenseSentences(sb, file);

		List<String> relatePhrase = HtmlUtil.find(file, "div[class=related-phrases-list-container]>li>a>span[value]");
		if (!relatePhrase.isEmpty()) {
			// makeFile runs on many pool threads at once; the write + de-duplicate
			// pair on the single shared file must not interleave.
			synchronized (PHRASE_FILE_LOCK) {
				FileUtil.writeTxt(relatePhrase, "c:/comingSun/英语/词组.txt");
				TxtUtil.distinctRow("c:/comingSun/英语/词组.txt");
			}
		}

		appendPhrases(sb, file);
		appendExampleSentences(sb, file);
		appendWebExamples(sb, file);

		FileUtil.overwriteTxt(sb.toString(), "c:/comingSun/英语/merriam词汇/"+ci+".txt");
	}

	/** Appends the related-word-forms section: one {@code form[label]} line per derived entry. */
	private static void appendRelatedForms(StringBuilder sb, String content) {
		List<String> entries = HtmlUtil.find(content, "div[class=entry-uros]>div[class=uro-content]");
		if (entries.isEmpty()) return;
		// Insertion-ordered so the forms come out in page order.
		Map<String,String> forms = new java.util.LinkedHashMap<>();
		for (String entry : entries) {
			List<String> words = HtmlUtil.find(entry, "span[class=ure][value]");
			List<String> kinds = HtmlUtil.find(entry, "span[class=fl][value]");
			// Skip incomplete entries instead of failing on get(0).
			if (words.isEmpty() || kinds.isEmpty()) continue;
			forms.put(words.get(0), kinds.get(0));
		}
		sb.append("<相关词态>\r\n");
		for (Map.Entry<String,String> form : forms.entrySet()) {
			sb.append(form.getKey()+"["+form.getValue()+"]\r\n");
		}
		sb.append("</相关词态>\r\n");
	}

	/** Appends the de-duplicated, tag-stripped example sentences attached to the word senses. */
	private static void appendSenseSentences(StringBuilder sb, String content) {
		List<String> items = HtmlUtil.find(content, "div[class=vg-sseq-entry-item]>span[class=ex-sent]");
		if (items.isEmpty()) return;
		// Insertion-ordered set: removes duplicates while keeping page order.
		Set<String> sentences = new java.util.LinkedHashSet<>();
		for (String item : items) {
			String text = StringUtil.kill(item, "<", ">");
			// Entries beginning with an em-dash entity are skipped.
			if (text.startsWith("&mdash;")) continue;
			sentences.add(text);
		}
		sb.append("<词义相关句子>\r\n");
		sb.append(SetUtil.set2Str(sentences,"\r\n")+"\r\n");
		sb.append("</词义相关句子>\r\n");
	}

	/** Appends the phrases section, with entries ordered by their position in the phrases block. */
	private static void appendPhrases(StringBuilder sb, String content) {
		List<String> blocks = HtmlUtil.find(content, "div[id=phrases]");
		if (blocks.isEmpty()) return;
		String phrases = blocks.get(0);
		// Maps each output line to the offset of its element inside the phrases block.
		Map<String, Integer> positions = new HashMap<>();
		for (String drp : HtmlUtil.find(phrases, "span[class=drp]")) {
			String cz = HtmlUtil.getValue(drp);
			positions.put("<"+HtmlUtil.changeHtmlSymbol(cz)+">", phrases.indexOf(drp));
		}
		collectEmphasised(positions, phrases, "div[class=sub-content-thread]>span");
		collectEmphasised(positions, phrases, "span[class=mw_t_sp]");

		List<String> ordered = MapUtil.orderKeyByValue(positions);
		sb.append("<词组>\r\n");
		for (String s : ordered) {
			// NOTE(review): if kill() removes everything from "<em" to ">", a phrase
			// heading that itself starts with "em" would be removed too — confirm.
			s = StringUtil.kill(s, "<em", ">").replace("</em>", "");
			sb.append(s+"\r\n");
		}
		sb.append("</词组>\r\n");
	}

	/** Records, for every element matching {@code selector} whose value contains an {@code <em} tag, its value and offset. */
	private static void collectEmphasised(Map<String, Integer> positions, String phrases, String selector) {
		for (String span : HtmlUtil.find(phrases, selector)) {
			String value = HtmlUtil.getValue(span);
			if (value != null && value.contains("<em")) {
				positions.put(HtmlUtil.changeHtmlSymbol(value), phrases.indexOf(span));
			}
		}
	}

	/** Appends the example-sentences section, grouped by the header each sentence follows. */
	private static void appendExampleSentences(StringBuilder sb, String content) {
		List<String> containers = HtmlUtil.find(content,"div[class=in-sentences-container]");
		if (containers.isEmpty()) return;
		String juzi = containers.get(0);
		List<String> labels = new ArrayList<>();
		List<Integer> starts = new ArrayList<>();
		collectHeaders(juzi, "span[class=ex-header]", labels, starts);

		// Insertion-ordered so groups keep first-seen order, provided putMapList
		// returns the map it was given — confirm in MapUtil.
		Map<String,List<String>> grouped = new java.util.LinkedHashMap<>();
		for (String span : HtmlUtil.find(juzi, "span[class=thread-anchor-content]")) {
			String type = type(labels, starts, juzi.indexOf(span));
			// Drop the opening tag and, when present, the trailing closing tag.
			String text = span.substring(span.indexOf(">")+1);
			int close = text.lastIndexOf("<");
			if (close >= 0) {
				text = text.substring(0, close);
			}
			text = StringUtil.kill(text.trim(), "<", ">").trim();
			grouped = MapUtil.putMapList(grouped, type, text.replace("<em>", "").replace("</em>", ""));
		}
		appendGrouped(sb, "句子例子", grouped);
	}

	/** Appends the web-examples section, grouped by the function-label header each quote follows. */
	private static void appendWebExamples(StringBuilder sb, String content) {
		List<String> containers = HtmlUtil.find(content, "div[class=on-web-container]");
		if (containers.isEmpty()) return;
		String sentence = containers.get(0);
		List<String> labels = new ArrayList<>();
		List<Integer> starts = new ArrayList<>();
		collectHeaders(sentence, "div[class=function-label-header]", labels, starts);

		Map<String,List<String>> grouped = new java.util.LinkedHashMap<>();
		for (String span : HtmlUtil.find(sentence, "span")) {
			List<String> quoted = HtmlUtil.find(span, "span[class=has-aq][value]");
			// Skip spans without a quote and the shop link.
			if (quoted.isEmpty() || quoted.get(0).contains("Find it on Amazon")) continue;
			String type = type(labels, starts, sentence.indexOf(span));
			grouped = MapUtil.putMapList(grouped, type, quoted.get(0).replace("<em>", "").replace("</em>", ""));
		}
		appendGrouped(sb, "网络例子", grouped);
	}

	/** Fills the parallel lists with the value and the offset of every element matching {@code selector}. */
	private static void collectHeaders(String html, String selector, List<String> labels, List<Integer> starts) {
		for (String header : HtmlUtil.find(html, selector)) {
			labels.add(HtmlUtil.getValue(header));
			starts.add(html.indexOf(header));
		}
	}

	/** Writes {@code <tag>}, then every group (preceded by {@code <key>} unless the key is empty), then {@code </tag>}. */
	private static void appendGrouped(StringBuilder sb, String tag, Map<String,List<String>> grouped) {
		sb.append("<"+tag+">\r\n");
		for (Map.Entry<String,List<String>> group : grouped.entrySet()) {
			String key = group.getKey();
			// A missing header label is treated like the empty (ungrouped) key.
			if (key != null && !key.isEmpty()) sb.append("<"+key+">\r\n");
			for (String line : group.getValue()) {
				sb.append(line+"\r\n");
			}
		}
		sb.append("</"+tag+">\r\n");
	}
	/**
	 * Returns the label of the section that contains the given offset.
	 *
	 * <p>{@code labels} and {@code idxs} are parallel lists: {@code idxs.get(i)} is the
	 * start offset of the section named {@code labels.get(i)}. A section runs from its
	 * own start (inclusive) up to the start of the next section (exclusive); the last
	 * section is open-ended.
	 *
	 * @param labels section labels
	 * @param idxs   start offset of each section, expected in ascending order
	 * @param idx    the offset to classify
	 * @return the matching label, or {@code ""} when there are no sections or
	 *         {@code idx} lies before the first section
	 */
	private static String type(List<String> labels,List<Integer> idxs,int idx) {
		// Content ahead of the first header belongs to no section; it must not be
		// attributed to the last label.
		if (idxs.isEmpty() || idx < idxs.get(0)) {
			return "";
		}
		int last = idxs.size() - 1;
		for (int i = 0; i < last; i++) {
			// Start is inclusive so an offset equal to a header start stays in that section.
			if (idx >= idxs.get(i) && idx < idxs.get(i + 1)) {
				return labels.get(i);
			}
		}
		// No bounded interval matched: fall back to the open-ended last section.
		return labels.get(last);
	}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

ak01_10

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值