在 pom.xml 中引入 Jsoup 和 HanLP 分词器的 Maven 依赖：
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
<!--分词-->
<!-- https://mvnrepository.com/artifact/com.hankcs/hanlp -->
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>portable-1.7.2</version>
</dependency>
工具类
import com.hankcs.hanlp.HanLP;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Looks up how a Chinese kinship relation is addressed by scraping a Baidu
 * search results page and extracting a keyword from the answer snippet.
 *
 * <p>The lookup depends on the live layout of the Baidu results page: it only
 * succeeds when the page text contains the question followed, somewhere later,
 * by the literal {@code 来自百度百科} ("from Baidu Baike").
 */
public class HtmlUtil {

    /** Baidu search endpoint; the query is sent as the {@code wd} parameter. */
    private static final String SEARCH_URL = "https://www.baidu.com/s";

    /** Suffix appended to the relation to form the question ("how to address"). */
    private static final String QUESTION_SUFFIX = "怎么称呼";

    /** Literal text that terminates the answer snippet in the page text. */
    private static final String ANSWER_END_MARKER = "来自百度百科";

    /** Demo entry point: prints the result for "哥哥的儿子" (elder brother's son). */
    public static void main(String[] args) {
        String rela = GetRela("哥哥的儿子");
        System.out.println(rela);
    }

    /**
     * Searches Baidu for {@code wd + "怎么称呼"} and returns the top keyword that
     * HanLP extracts from the text between the question and the
     * {@code 来自百度百科} marker.
     *
     * @param wd the relation to look up, e.g. {@code "哥哥的儿子"}
     * @return the first extracted keyword, or {@code null} when {@code wd} is
     *         null/empty, the page cannot be fetched, the expected markers are
     *         not present, or no keyword can be extracted
     */
    public static String GetRela(String wd) {
        if (wd == null || wd.isEmpty()) {
            return null;
        }

        String question = wd + QUESTION_SUFFIX;
        StringBuilder chinese = new StringBuilder();
        try {
            // Pass the query through data(...) so jsoup URL-encodes it; raw
            // concatenation breaks on characters such as '&', '#', '+' or '%'.
            Document doc = Jsoup.connect(SEARCH_URL).data("wd", question).get();
            Elements body = doc.getElementsByTag("body");
            for (Element element : body) {
                appendChinese(element.text(), chinese);
            }
        } catch (IOException e) {
            // Best effort: report the failure and signal "no answer" to the caller.
            e.printStackTrace();
            return null;
        }

        // The page text has been reduced to CJK characters only, so the start
        // marker must be reduced the same way or it could never match.
        StringBuilder marker = new StringBuilder();
        appendChinese(question, marker);

        // Quote the marker so it is always matched literally. The capture is
        // greedy on purpose: it runs up to the LAST occurrence of the end marker.
        Pattern pattern = Pattern.compile(
                Pattern.quote(marker.toString()) + "([\\d\\D]*)" + ANSWER_END_MARKER);
        Matcher matcher = pattern.matcher(chinese);
        if (!matcher.find()) {
            return null;
        }

        List<String> keywords = HanLP.extractKeyword(matcher.group(1), 2);
        // An empty snippet yields no keywords; avoid IndexOutOfBoundsException.
        if (keywords == null || keywords.isEmpty()) {
            return null;
        }
        return keywords.get(0);
    }

    /** Appends to {@code out} every character of {@code text} in U+4E00..U+9FA5. */
    private static void appendChinese(String text, StringBuilder out) {
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (c >= 0x4E00 && c <= 0x9FA5) {
                out.append(c);
            }
        }
    }
}