/**
 * Crawls the Merriam-Webster word-finder index for every starting letter a-z and
 * produces one summary file per listed word via {@code makeFile}.
 *
 * <p>For each letter, index pages 1 to 9 are fetched in order; paging for that letter
 * stops at the first page that yields no usable word link. The words of one index page
 * are processed on a fixed pool of 20 threads, and the next page is only fetched after
 * {@code ThreadUtil.runUntilTerminated} returns for that pool.
 *
 * @param args ignored
 * @throws IOException declared by the signature; failures inside the crawl loop are
 *                     caught and printed instead of being propagated
 */
public static void main(String[] args) throws IOException {
    Connection conn = DbUtil.getBestConn();
    try {
        // Not referenced below; kept in case the DbUtil constructor has side effects — TODO confirm.
        DbUtil db = new DbUtil(conn);
        for (char letter = 'a'; letter <= 'z'; letter++) {
            for (int i = 1; i < 10; i++) {
                String page = SpiderUtil.getPageContent(
                        "https://www.merriam-webster.com/wordfinder/classic/begins/common/-1/" + letter + "/" + i);
                Set<String> cis = new HashSet<>();
                for (String a : HtmlUtil.find(page, "div[class=panel-body]>li>a")) {
                    String ci = HtmlUtil.getValue(a);
                    if (ci.length() < 2) {
                        continue;
                    }
                    // Keep only links whose target is exactly the dictionary entry of the link text.
                    String href = HtmlUtil.propertyValue(a, "href");
                    if (href.equals("/dictionary/" + ci)) {
                        cis.add(ci);
                    }
                }
                if (cis.isEmpty()) {
                    break;
                }
                // Fixed pool of 20 worker threads for the words of this index page.
                ExecutorService executor = Executors.newFixedThreadPool(20);
                for (String ci : cis) {
                    executor.submit(() -> {
                        System.out.println(ci);
                        // Skip words whose summary file already exists. The path must match the
                        // "<word>.txt" name that makeFile writes, including the dot.
                        if (FileUtil.isFileExists("c:/comingSun/英语/merriam词汇/" + ci + ".txt")) {
                            return;
                        }
                        try {
                            String file = SpiderUtil.getPageContent("https://www.merriam-webster.com/dictionary/" + ci);
                            makeFile(ci, file);
                        } catch (RuntimeException e) {
                            // submit() would otherwise park the exception in a Future nobody reads.
                            System.out.println("failed: " + ci);
                            e.printStackTrace();
                        }
                    });
                }
                ThreadUtil.runUntilTerminated(executor);
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        DbUtil.close(conn);
    }
}
/**
 * Serializes updates to the shared phrase list file. {@code makeFile} is submitted to a
 * multi-threaded pool by {@code main}, and every invocation writes to the same path.
 */
private static final Object PHRASE_FILE_LOCK = new Object();

/**
 * Stores the raw dictionary page of one word and writes a tagged plain-text summary of it.
 *
 * <p>The raw page is always written to the {@code merriam词汇_存储} directory first. The
 * summary is then built from the {@code div[id=left-content]} part of the page and written
 * to the {@code merriam词汇} directory. Sections, in order: parts of speech ({@code <词性>}),
 * inflections ({@code <时态>}), related word forms ({@code <相关词态>}), sense example
 * sentences ({@code <词义相关句子>}), phrases ({@code <词组>}), example sentences
 * ({@code <句子例子>}) and web examples ({@code <网络例子>}). Optional sections are omitted
 * when the page has no matching elements. Related phrases are additionally added to the
 * shared {@code 词组.txt} file.
 *
 * @param ci   the word; used as the base name of both output files
 * @param file the HTML of the word's dictionary page
 */
private static void makeFile(String ci, String file) {
    FileUtil.overwriteTxt(file, "c:/comingSun/英语/merriam词汇_存储/" + ci + ".txt");

    // Without the main content block there is nothing to summarize; bail out instead of
    // failing on get(0).
    List<String> leftContent = HtmlUtil.find(file, "div[id=left-content]");
    if (leftContent.isEmpty()) {
        System.out.println(ci + ": no div[id=left-content] found, summary not written");
        return;
    }
    file = leftContent.get(0);

    StringBuilder sb = new StringBuilder();
    List<String> cixing = HtmlUtil.find(file, "h2[class=parts-of-speech]>a[value]");
    List<String> shitai = HtmlUtil.find(file, "span[class=if][value]");
    sb.append("<词性>" + ListUtil.list2Str(ListUtil.killSame(cixing), ",") + "\r\n");
    sb.append("<时态>" + ListUtil.list2Str(ListUtil.killSame(shitai), ",") + "\r\n");

    // Related word forms, written as "form[part of speech]".
    List<String> xiangguan = HtmlUtil.find(file, "div[class=entry-uros]>div[class=uro-content]");
    if (!xiangguan.isEmpty()) {
        sb.append("<相关词态>\r\n");
        Map<String, String> forms = new HashMap<>();
        for (String xg : xiangguan) {
            List<String> ure = HtmlUtil.find(xg, "span[class=ure][value]");
            List<String> fl = HtmlUtil.find(xg, "span[class=fl][value]");
            // An entry lacking either part is skipped rather than aborting the whole word.
            if (ure.isEmpty() || fl.isEmpty()) {
                continue;
            }
            forms.put(ure.get(0), fl.get(0));
        }
        for (Map.Entry<String, String> form : forms.entrySet()) {
            sb.append(form.getKey() + "[" + form.getValue() + "]\r\n");
        }
        sb.append("</相关词态>\r\n");
    }

    // Example sentences attached to the individual senses, de-duplicated.
    List<String> items = HtmlUtil.find(file, "div[class=vg-sseq-entry-item]>span[class=ex-sent]");
    if (!items.isEmpty()) {
        sb.append("<词义相关句子>\r\n");
        Set<String> re = new HashSet<>();
        for (String item : items) {
            item = StringUtil.kill(item, "<", ">");
            if (item.startsWith("—")) {
                continue;
            }
            re.add(item);
        }
        sb.append(SetUtil.set2Str(re, "\r\n") + "\r\n");
        sb.append("</词义相关句子>\r\n");
    }

    // Related phrases go to one file shared by all words, so the write and the following
    // de-duplication must not interleave with another worker thread's update.
    List<String> relatePhrase = HtmlUtil.find(file, "div[class=related-phrases-list-container]>li>a>span[value]");
    if (!relatePhrase.isEmpty()) {
        synchronized (PHRASE_FILE_LOCK) {
            FileUtil.writeTxt(relatePhrase, "c:/comingSun/英语/词组.txt");
            TxtUtil.distinctRow("c:/comingSun/英语/词组.txt");
        }
    }

    // Phrase section: collect phrase headings and emphasized spans keyed by text, valued by
    // their offset inside the section, then emit them ordered by MapUtil.orderKeyByValue.
    List<String> phrasess = HtmlUtil.find(file, "div[id=phrases]");
    if (!phrasess.isEmpty()) {
        String phrases = phrasess.get(0);
        Map<String, Integer> spanmap = new HashMap<>();
        for (String drp : HtmlUtil.find(phrases, "span[class=drp]")) {
            String cz = HtmlUtil.getValue(drp);
            spanmap.put("<" + HtmlUtil.changeHtmlSymbol(cz) + ">", phrases.indexOf(drp));
        }
        collectEmphasizedSpans(spanmap, phrases, "div[class=sub-content-thread]>span");
        collectEmphasizedSpans(spanmap, phrases, "span[class=mw_t_sp]");
        List<String> orderspan = MapUtil.orderKeyByValue(spanmap);
        sb.append("<词组>\r\n");
        for (String s : orderspan) {
            s = StringUtil.kill(s, "<em", ">").replace("</em>", "");
            sb.append(s + "\r\n");
        }
        sb.append("</词组>\r\n");
    }

    // Example sentences, grouped under the header that precedes them in the container.
    List<String> juzilist = HtmlUtil.find(file, "div[class=in-sentences-container]");
    if (!juzilist.isEmpty()) {
        String juzi = juzilist.get(0);
        List<String> headLabels = new ArrayList<>();
        List<Integer> headStarts = new ArrayList<>();
        for (String head : HtmlUtil.find(juzi, "span[class=ex-header]")) {
            headStarts.add(juzi.indexOf(head));
            headLabels.add(HtmlUtil.getValue(head));
        }
        Map<String, List<String>> bySection = new HashMap<>();
        for (String span : HtmlUtil.find(juzi, "span[class=thread-anchor-content]")) {
            String section = type(headLabels, headStarts, juzi.indexOf(span));
            // Strip the element's own opening and closing tag, then any nested tags.
            span = span.substring(span.indexOf(">") + 1);
            span = span.substring(0, span.lastIndexOf("<")).trim();
            span = StringUtil.kill(span, "<", ">").trim();
            bySection = MapUtil.putMapList(bySection, section, span.replace("<em>", "").replace("</em>", ""));
        }
        appendLabeledExamples(sb, "句子例子", bySection);
    }

    // Examples quoted from the web, grouped the same way by function-label header.
    List<String> sentencelist = HtmlUtil.find(file, "div[class=on-web-container]");
    if (!sentencelist.isEmpty()) {
        String sentence = sentencelist.get(0);
        List<String> headLabels = new ArrayList<>();
        List<Integer> headStarts = new ArrayList<>();
        for (String head : HtmlUtil.find(sentence, "div[class=function-label-header]")) {
            headStarts.add(sentence.indexOf(head));
            headLabels.add(HtmlUtil.getValue(head));
        }
        Map<String, List<String>> bySection = new HashMap<>();
        for (String span : HtmlUtil.find(sentence, "span")) {
            int idx = sentence.indexOf(span);
            List<String> quoted = HtmlUtil.find(span, "span[class=has-aq][value]");
            if (quoted.isEmpty() || quoted.get(0).contains("Find it on Amazon")) {
                continue;
            }
            String section = type(headLabels, headStarts, idx);
            bySection = MapUtil.putMapList(bySection, section, quoted.get(0).replace("<em>", "").replace("</em>", ""));
        }
        appendLabeledExamples(sb, "网络例子", bySection);
    }

    FileUtil.overwriteTxt(sb.toString(), "c:/comingSun/英语/merriam词汇/" + ci + ".txt");
}

/**
 * Adds every element matched by {@code selector} whose value contains an {@code <em} tag
 * to {@code spanmap}, keyed by its converted text and valued by its offset in {@code phrases}.
 */
private static void collectEmphasizedSpans(Map<String, Integer> spanmap, String phrases, String selector) {
    for (String span : HtmlUtil.find(phrases, selector)) {
        String content = HtmlUtil.getValue(span);
        if (content != null && content.contains("<em")) {
            spanmap.put(HtmlUtil.changeHtmlSymbol(content), phrases.indexOf(span));
        }
    }
}

/**
 * Appends one {@code <tag>...</tag>} section: for each group its {@code <label>} line
 * (omitted when the label is empty) followed by the group's sentences, one per line.
 */
private static void appendLabeledExamples(StringBuilder sb, String tag, Map<String, List<String>> groups) {
    sb.append("<" + tag + ">\r\n");
    for (Map.Entry<String, List<String>> group : groups.entrySet()) {
        if (!group.getKey().isEmpty()) {
            sb.append("<" + group.getKey() + ">\r\n");
        }
        for (String line : group.getValue()) {
            sb.append(line + "\r\n");
        }
    }
    sb.append("</" + tag + ">\r\n");
}
/**
 * Picks the section label that applies to an element found at offset {@code idx}.
 *
 * <p>{@code labels} and {@code idxs} are parallel lists: {@code idxs.get(i)} is the offset
 * at which the header carrying {@code labels.get(i)} starts. The offsets are expected in
 * ascending order. The result is the label of the last header that starts at or before
 * {@code idx}.
 *
 * @param labels header labels, parallel to {@code idxs}
 * @param idxs   start offsets of the headers, ascending
 * @param idx    offset of the element to classify; {@code -1} when it was not found
 * @return the governing label, or {@code ""} when there are no headers or {@code idx}
 *         lies before the first one
 */
private static String type(List<String> labels, List<Integer> idxs, int idx) {
    String label = "";
    for (int i = 0; i < idxs.size(); i++) {
        // Headers are ascending, so the first one past idx ends the search.
        if (idxs.get(i) > idx) {
            break;
        }
        label = labels.get(i);
    }
    return label;
}
// 爬英语单词 (Crawling English words)
// 最新推荐文章于 2024-05-21 19:15:00 发布 (latest recommended article published 2024-05-21 19:15:00)