爱词霸 最新爬取 Java + mysql (含源码+数据库) 感觉金山词库的内容相对来说是最完整的!研究了一天,通过爬取 HTML 整理出来的!当然这只是教程,切勿真实爬取。具体为什么这样实现,这里不做阐述!切勿实际爬取、切勿实际爬取、切勿实际爬取。本人不承担任何责任,这只是单纯的学术研究!!
放点代码片段吧
import com.alibaba.fastjson.JSON;
import com.english.dto.iciba.DataDict;
import com.english.dto.iciba.WordInfo;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;
/**
 * Small HTTP helper that fetches a word page from iciba.com and binds the JSON
 * state embedded in the returned HTML to a {@link WordInfo}.
 *
 * <p>All methods are static. I/O failures are printed to standard error and
 * reported to the caller as an empty string (HTTP helpers) or {@code null}
 * ({@link #getWordInfo(String)}).
 */
class GetPostTest {

    /** Browser-like user agent sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36";

    /** Page that is queried for a word; the word is passed as the {@code w} query parameter. */
    private static final String WORD_PAGE_URL = "https://www.iciba.com/word";

    /** Text that marks the beginning of the JSON payload inside the page. */
    private static final String JSON_START_MARKER = "{\"props\":{\"pageProps\"";

    /** Text that directly follows the JSON payload inside the page. */
    private static final String JSON_END_MARKER = "</script></body>";

    /**
     * Performs an HTTP GET and returns the response body decoded as UTF-8.
     *
     * <p>The response headers are printed to standard output, one per line.
     *
     * @param url   target URL without a query string
     * @param param query string without the leading {@code ?}; when {@code null}
     *              or empty the URL is requested as-is
     * @return the body with every line prefixed by {@code "\n"}, or an empty
     *         string if the request fails
     */
    public static String sendGet(String url, String param) {
        String urlName = (param == null || param.isEmpty()) ? url : url + "?" + param;
        try {
            URLConnection conn = new URL(urlName).openConnection();
            applyCommonHeaders(conn);
            conn.connect();
            for (Map.Entry<String, List<String>> header : conn.getHeaderFields().entrySet()) {
                System.out.println(header.getKey() + "-->" + header.getValue());
            }
            return readBody(conn);
        } catch (IOException e) {
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Performs an HTTP POST with {@code param} as the request body (written as
     * UTF-8) and returns the response body decoded as UTF-8.
     *
     * @param url   target URL
     * @param param request body; {@code null} sends an empty body
     * @return the body with every line prefixed by {@code "\n"}, or an empty
     *         string if the request fails
     */
    public static String sendPost(String url, String param) {
        try {
            URLConnection conn = new URL(url).openConnection();
            applyCommonHeaders(conn);
            // A POST needs both output (request body) and input (response) enabled.
            conn.setDoOutput(true);
            conn.setDoInput(true);
            // Write with an explicit charset so the body matches the UTF-8 used when reading,
            // and close the writer before the response is consumed.
            try (PrintWriter out = new PrintWriter(
                    new OutputStreamWriter(conn.getOutputStream(), StandardCharsets.UTF_8))) {
                out.print(param == null ? "" : param);
                out.flush();
            }
            return readBody(conn);
        } catch (IOException e) {
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Downloads the page for {@code word} and extracts the embedded word information.
     *
     * <p>The JSON between the start marker and the end marker is printed to
     * standard output before it is parsed.
     *
     * @param word the word to look up; it is URL-encoded before being sent
     * @return the parsed {@link WordInfo}, or {@code null} when the word is
     *         {@code null}/empty, the download fails, the markers are not found,
     *         or any level of the parsed object graph is missing
     */
    public static WordInfo getWordInfo(String word) {
        if (word == null || word.isEmpty()) {
            return null;
        }

        String encodedWord;
        try {
            encodedWord = URLEncoder.encode(word, "UTF-8");
        } catch (IOException e) {
            // UnsupportedEncodingException: every JVM is required to support UTF-8.
            e.printStackTrace();
            return null;
        }

        String html = sendGet(WORD_PAGE_URL, "w=" + encodedWord);
        if (html == null || html.isEmpty()) {
            return null;
        }

        // indexOf reports "not found" as -1, so both results must be checked against
        // that value; the end marker is searched from the start marker onwards so the
        // substring range is always valid.
        int start = html.indexOf(JSON_START_MARKER);
        if (start < 0) {
            return null;
        }
        int end = html.indexOf(JSON_END_MARKER, start);
        if (end < 0) {
            return null;
        }

        String bean = html.substring(start, end);
        System.out.println(bean);

        DataDict data = Json2Pojo(bean, DataDict.class);
        if (data == null
                || data.getProps() == null
                || data.getProps().getPageProps() == null
                || data.getProps().getPageProps().getInitialReduxState() == null
                || data.getProps().getPageProps().getInitialReduxState().getWord() == null) {
            return null;
        }
        // May itself be null, which is the documented "not available" result.
        return data.getProps().getPageProps().getInitialReduxState().getWord().getWordInfo();
    }

    /**
     * Converts a JSON string into an instance of {@code clazz}.
     *
     * <p>The text is first parsed into a generic JSON value, serialized back to
     * a string, and that string is then bound to the target class.
     *
     * @param str   JSON text
     * @param clazz target type
     * @param <T>   type of the returned object
     * @return the bound object
     */
    public static <T> T Json2Pojo(String str, Class<T> clazz) {
        return JSON.parseObject(JSON.parse(str).toString(), clazz);
    }

    /** Sets the request headers shared by GET and POST requests. */
    private static void applyCommonHeaders(URLConnection conn) {
        conn.setRequestProperty("accept", "*/*");
        conn.setRequestProperty("connection", "Keep-Alive");
        conn.setRequestProperty("user-agent", USER_AGENT);
    }

    /** Reads the whole response as UTF-8, prefixing each line with a newline, and closes the stream. */
    private static String readBody(URLConnection conn) throws IOException {
        StringBuilder body = new StringBuilder();
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = in.readLine()) != null) {
                body.append('\n').append(line);
            }
        }
        return body.toString();
    }
}
没写注释,自行理解吧!
这是主要的代码,然后结构化,因为各自需求不一样,所以需要单独写!
具体实现的源码,我放到另外的链接里面了!
另外一个需要注意的是,金山人包括程序员们都会做流量监控,所以你懂的!