最新爱词霸 Java + mysql （含源码+数据库）

p799411891

已于 2022-08-09 09:42:04 修改

阅读量249

点赞数

分类专栏： java总结 DB 文章标签： java 数据库 mysql

于 2022-08-09 09:39:16 首次发布

本文链接：https://blog.csdn.net/p799411891/article/details/126241074

版权

java总结同时被 2 个专栏收录

6 篇文章 0 订阅

订阅专栏

4 篇文章 0 订阅

订阅专栏

爱词霸最新爬取 Java + mysql（含源码+数据库）。感觉金山词库的内容相对来说是最完整的！研究了一天，通过解析爬取到的 HTML 页面整理出来的。当然这只是教程，切勿真实爬取。至于具体实现为什么这么做，这里不做阐述。切勿实际爬取、切勿实际爬取、切勿实际爬取！本人不承担任何责任，这只是单纯的学术研究！！

放点代码片段吧


import com.alibaba.fastjson.JSON;
import com.english.dto.iciba.DataDict;
import com.english.dto.iciba.WordInfo;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;

/**
 * Small HTTP helper that downloads a word page from iciba.com and maps the JSON
 * page state embedded in the HTML onto a {@link WordInfo}.
 *
 * <p>All methods are static. Network failures are printed to standard error and
 * reported to the caller as an empty string (or {@code null} for
 * {@link #getWordInfo(String)}). No connect or read timeout is configured.
 */
class GetPostTest {
    /** Browser-like User-Agent header sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36";

    /** Page that is fetched for a single word; the word goes into the {@code w} query parameter. */
    private static final String WORD_PAGE_URL = "https://www.iciba.com/word";

    /** Text that marks the beginning of the embedded JSON page state. */
    private static final String PAGE_JSON_START = "{\"props\":{\"pageProps\"";

    /** Text that immediately follows the embedded JSON page state. */
    private static final String PAGE_JSON_END = "</script></body>";

    /**
     * Performs a GET request against {@code url + "?" + param} and returns the response body.
     *
     * <p>The response headers are printed to standard output, one per line.
     *
     * @param url   the address without a query string
     * @param param the query string, already URL-encoded by the caller
     * @return the response body decoded as UTF-8, with every line preceded by {@code "\n"};
     *         an empty string if the request fails
     */
    public static String sendGet(String url, String param) {
        String urlName = url + "?" + param;
        try {
            URLConnection conn = openConnection(urlName);
            conn.connect();
            for (Map.Entry<String, List<String>> header : conn.getHeaderFields().entrySet()) {
                System.out.println(header.getKey() + "-->" + header.getValue());
            }
            return readBody(conn);
        } catch (IOException e) {
            e.printStackTrace();
            // A partially read body is useless to callers, so report failure as "no content".
            return "";
        }
    }

    /**
     * Sends {@code param} as the request body to {@code url} and returns the response body.
     *
     * @param url   the address to send the request to
     * @param param the request body; it is written as UTF-8
     * @return the response body decoded as UTF-8, with every line preceded by {@code "\n"};
     *         an empty string if the request fails
     */
    public static String sendPost(String url, String param) {
        try {
            URLConnection conn = openConnection(url);
            // Enabling output is what allows a request body to be written.
            conn.setDoOutput(true);
            conn.setDoInput(true);
            try (PrintWriter out = new PrintWriter(
                    new OutputStreamWriter(conn.getOutputStream(), StandardCharsets.UTF_8))) {
                out.print(param);
                // PrintWriter swallows IOExceptions; checkError() flushes and surfaces them.
                if (out.checkError()) {
                    throw new IOException("Failed to write request body to " + url);
                }
            }
            return readBody(conn);
        } catch (IOException e) {
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Fetches the iciba.com page for {@code word} and extracts its word information.
     *
     * <p>The JSON text found in the page is printed to standard output before it is parsed.
     * Exceptions thrown by the JSON parser for malformed content are not caught here.
     *
     * @param word the word to look up; it is URL-encoded before being sent
     * @return the parsed word information, or {@code null} if {@code word} is null or empty,
     *         the page could not be fetched, the page does not contain the expected JSON
     *         markers, or any level of the parsed structure is missing
     */
    public static WordInfo getWordInfo(String word) {
        if (word == null || word.isEmpty()) {
            return null;
        }
        String html = GetPostTest.sendGet(WORD_PAGE_URL, "w=" + encodeQueryValue(word));
        String bean = extractPageJson(html);
        if (bean == null) {
            return null;
        }
        System.out.println(bean);
        DataDict data = Json2Pojo(bean, DataDict.class);
        if (data == null
                || data.getProps() == null
                || data.getProps().getPageProps() == null
                || data.getProps().getPageProps().getInitialReduxState() == null
                || data.getProps().getPageProps().getInitialReduxState().getWord() == null) {
            return null;
        }
        return data.getProps().getPageProps().getInitialReduxState().getWord().getWordInfo();
    }

    /**
     * Converts JSON text into an instance of {@code clazz}.
     *
     * @param str   the JSON text
     * @param clazz the target type
     * @param <T>   the target type
     * @return the populated object, or {@code null} if the parser yields no value for {@code str}
     */
    public static <T> T Json2Pojo(String str, Class<T> clazz) {
        // NOTE(review): the text is parsed, re-serialized and parsed again; this is kept as-is
        // because callers may rely on it (e.g. for JSON wrapped in a JSON string) - confirm.
        Object parsed = JSON.parse(str);
        if (parsed == null) {
            return null;
        }
        return JSON.parseObject(parsed.toString(), clazz);
    }

    /** Opens a connection to {@code address} with the request headers shared by GET and POST. */
    private static URLConnection openConnection(String address) throws IOException {
        URLConnection conn = new URL(address).openConnection();
        conn.setRequestProperty("accept", "*/*");
        conn.setRequestProperty("connection", "Keep-Alive");
        conn.setRequestProperty("user-agent", USER_AGENT);
        return conn;
    }

    /** Reads the whole response as UTF-8, prefixing each line with a newline, and closes the stream. */
    private static String readBody(URLConnection conn) throws IOException {
        StringBuilder body = new StringBuilder();
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = in.readLine()) != null) {
                body.append('\n').append(line);
            }
        }
        return body.toString();
    }

    /** Returns the JSON text between the page markers, or {@code null} if either marker is absent. */
    private static String extractPageJson(String html) {
        if (html == null || html.isEmpty()) {
            return null;
        }
        // indexOf signals "not found" with -1, not 0.
        int start = html.indexOf(PAGE_JSON_START);
        if (start < 0) {
            return null;
        }
        // Search from the start marker so an earlier occurrence of the end marker is ignored.
        int end = html.indexOf(PAGE_JSON_END, start);
        if (end < 0) {
            return null;
        }
        return html.substring(start, end);
    }

    /** URL-encodes {@code value} as UTF-8 so spaces and non-ASCII words form a valid query string. */
    private static String encodeQueryValue(String value) {
        try {
            return URLEncoder.encode(value, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            // Every Java platform is required to support UTF-8.
            throw new IllegalStateException("UTF-8 is not supported", e);
        }
    }
}

没写注释，自行理解吧！

以上是主要的代码；至于数据的结构化处理，因为各自需求不一样，需要自行单独编写！
具体实现的源码，我放到另外的链接里面了！
另外需要注意的是，金山的工作人员（包括程序员）都会做流量监控，所以你懂的！

金山爱词霸最新爬取 Java + mysql （含源码+数据库）