A Simple Implementation of Crawling Web Pages with OkHttp + Jsoup and Storing Them in MongoDB

Tools: IntelliJ IDEA

  • Maven
<dependencies>
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.11.3</version>
    </dependency>

    <dependency>
        <groupId>org.mongodb</groupId>
        <artifactId>bson</artifactId>
        <version>3.6.4</version>
    </dependency>

    <dependency>
        <groupId>com.google.cloud.trace.instrumentation.jdbc</groupId>
        <artifactId>driver</artifactId>
        <version>0.1.1</version>
        <type>pom</type>
    </dependency>
    <dependency>
        <groupId>ch.qos.logback.contrib</groupId>
        <artifactId>logback-mongodb-access</artifactId>
        <version>0.1.5</version>
    </dependency>
    <!-- MongoDB Java driver; version aligned with the bson artifact above -->
    <dependency>
        <groupId>org.mongodb</groupId>
        <artifactId>mongo-java-driver</artifactId>
        <version>3.6.4</version>
    </dependency>
    <dependency>
        <groupId>com.squareup.okio</groupId>
        <artifactId>okio</artifactId>
        <version>1.11.0</version>
    </dependency>
    <dependency>
        <groupId>com.squareup.okhttp3</groupId>
        <artifactId>okhttp</artifactId>
        <version>3.6.0</version>
    </dependency>

    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.47</version>
    </dependency>
</dependencies>
  • OkHttp fetcher class
package DATA;

import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;

public class CrawlData {
    /**
     * Downloads the HTML content of a page.
     *
     * @param url the page URL to fetch
     * @return the response body as a string, or null on failure
     */
    public static String downloadHtml(String url) {
        String body = null;
        OkHttpClient client = new OkHttpClient();
        // Build the GET request
        Request request = new Request.Builder().url(url).build();
        // Execute the call; try-with-resources closes the response body
        try (Response response = client.newCall(request).execute()) {
            body = response.body().string();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return body;
    }
}
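In practice it is worth reusing a single OkHttpClient (it owns a connection pool and dispatcher, so creating one per request wastes resources) and setting explicit timeouts plus a User-Agent header, since some servers reject clients without one. A minimal sketch; the timeout values and User-Agent string here are arbitrary placeholder choices, not requirements:

import java.util.concurrent.TimeUnit;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;

public class ConfiguredCrawler {
    // One shared client for the whole crawler
    private static final OkHttpClient CLIENT = new OkHttpClient.Builder()
            .connectTimeout(10, TimeUnit.SECONDS)
            .readTimeout(30, TimeUnit.SECONDS)
            .build();

    public static String downloadHtml(String url) {
        Request request = new Request.Builder()
                .url(url)
                // Placeholder User-Agent; some sites block the library default
                .header("User-Agent", "Mozilla/5.0 (compatible; DemoCrawler/1.0)")
                .build();
        try (Response response = CLIENT.newCall(request).execute()) {
            if (!response.isSuccessful()) {
                return null; // treat non-2xx responses as failures
            }
            return response.body().string();
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }
}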

Once the HTML has been fetched, it is handed to Jsoup for parsing.

  • Jsoup parser class
package DATA;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.*;

public class Analysis {
    /**
     * Parses the downloaded HTML into a list of site records.
     *
     * @param htmlBody the raw HTML of the list page
     * @return one map per site entry, keyed by field name
     */
    public static List<Map<String, Object>> analysisData(String htmlBody) throws Exception {
        List<Map<String, Object>> list = new ArrayList<Map<String, Object>>();
        Document doc = Jsoup.parse(htmlBody);
        // Each <li> under ul.listCentent is one site entry
        Elements elements = doc.select("ul.listCentent").select("li");
        System.out.println("number of entries: " + elements.size());
        for (Element element : elements) {
            Map<String, Object> map = new HashMap<String, Object>();
            // Site name
            String siteName = element.select("div.CentTxt > h3.rightTxtHead > a").text();
            System.out.println("siteName: " + siteName);
            // Domain name
            String domainName = element.select("div.CentTxt > h3.rightTxtHead > span").text();
            System.out.println("domainName: " + domainName);
            // Alexa rank
            String alexaRank = element.select("li.clearfix > div.CentTxt > div.RtCPart > p").text();
            System.out.println("AlexaRank: " + alexaRank);
            // Company synopsis
            String synopsis = element.select("div.CentTxt > p").text();
            System.out.println("synopsis: " + synopsis);
            // Score
            String score = element.select("div.RtCRateCent > span").text();
            System.out.println("score: " + score);
            // Site rank
            String siteRank = element.select("div.RtCRateCent > strong").text();
            System.out.println("siteRank: " + siteRank);
            // Detail-page URL
            String webSite = "http://top.chinaz.com" + element.select("a").first().attr("href");
            System.out.println("webSite: " + webSite);
            // ICP record (备案) information, fetched from the detail page
            String recordInformation = getRecordInformation(webSite);
            System.out.println("recordInformation: " + recordInformation);
            System.out.println();
            map.put("siteName", siteName);
            map.put("domainName", domainName);
            map.put("AlexaRank", alexaRank);
            map.put("Synopsis", synopsis);
            map.put("score", score);
            map.put("siteRank", siteRank);
            map.put("webSite", webSite);
            map.put("RecordInformation", recordInformation);
            list.add(map);
            // Short pause between detail-page requests to go easy on the server
            try {
                Thread.sleep(100);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
        return list;
    }

    /**
     * Fetches the ICP record information from a site's detail page.
     */
    private static String getRecordInformation(String url) {
        String htmlBody = CrawlData.downloadHtml(url);
        if (htmlBody != null) {
            Document doc = Jsoup.parse(htmlBody);
            return doc.select("li.TMain06List-Left > p").text();
        }
        return null;
    }
}
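The selectors above are tied to the current markup of top.chinaz.com; if the page layout changes they silently return empty strings rather than failing. A quick way to sanity-check a selector is to run it against a small hand-written fragment. The fragment below is illustrative only, not the real page markup:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class SelectorCheck {
    public static void main(String[] args) {
        // A hand-written fragment mimicking one list entry
        String html = "<ul class=\"listCentent\">"
                + "<li class=\"clearfix\">"
                + "<div class=\"CentTxt\">"
                + "<h3 class=\"rightTxtHead\"><a href=\"/site_demo\">Demo Site</a>"
                + "<span>demo.example.com</span></h3>"
                + "<p>A short synopsis.</p>"
                + "</div></li></ul>";
        Document doc = Jsoup.parse(html);
        // Should print "Demo Site" and "demo.example.com"
        System.out.println(doc.select("div.CentTxt > h3.rightTxtHead > a").text());
        System.out.println(doc.select("div.CentTxt > h3.rightTxtHead > span").text());
    }
}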
  • Storing into MongoDB
package DATA;

import com.mongodb.MongoClient;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import org.bson.Document;
import java.util.Map;

public class StoreData {

    public static void adds(Map<String, Object> dataMap) {
        try {
            // Connect to the MongoDB server
            MongoClient mongoClient = new MongoClient("localhost", 27017);
            // Select the database
            MongoDatabase mongoDatabase = mongoClient.getDatabase("sit_rank");
            System.out.println("connected to database");
            MongoCollection<Document> collection = mongoDatabase.getCollection("information");
            System.out.println("collection 'information' selected");
            // org.bson.Document accepts a Map directly, so the parsed fields
            // (siteName, domainName, AlexaRank, Synopsis, score, siteRank,
            // webSite, RecordInformation) carry over as-is
            Document document = new Document(dataMap);
            // Use the site name as _id so each site is stored only once
            document.put("_id", dataMap.get("siteName"));
            collection.insertOne(document);
            System.out.println("document inserted");
            // Close the MongoDB connection
            mongoClient.close();
            System.out.println("MongoDB connection closed");
        } catch (Exception e) {
            System.err.println(e.getClass().getName() + ": " + e.getMessage());
        }
    }
}
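Since _id is set to the site name, running the crawler a second time makes insertOne fail with a duplicate-key error for sites that are already stored. If re-runs should update existing records instead, one option is an upsert via replaceOne. A minimal sketch against the same collection; Filters and UpdateOptions come from com.mongodb.client.model:

import com.mongodb.client.MongoCollection;
import com.mongodb.client.model.Filters;
import com.mongodb.client.model.UpdateOptions;
import org.bson.Document;
import java.util.Map;

public class UpsertStore {
    // Replaces the existing record for this site, or inserts it if absent
    public static void upsert(MongoCollection<Document> collection,
                              Map<String, Object> dataMap) {
        Document document = new Document(dataMap);
        document.put("_id", dataMap.get("siteName"));
        collection.replaceOne(
                Filters.eq("_id", dataMap.get("siteName")),
                document,
                new UpdateOptions().upsert(true));
    }
}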
  • Test class
package DATA;

import java.util.List;
import java.util.Map;

public class ExecuteTask {
    public static void main(String[] args) throws Exception {
        // Download the list page
        String url = "http://top.chinaz.com/all/index.html";
        System.out.println("crawl started, please wait...");
        String htmlBody = CrawlData.downloadHtml(url);
        System.out.println("page downloaded");
        // Parse the downloaded HTML
        List<Map<String, Object>> dataList = Analysis.analysisData(htmlBody);
        System.out.println("data parsed");
        // Store each record in MongoDB
        for (Map<String, Object> data : dataList) {
            StoreData.adds(data);
            System.out.println("record stored");
        }
    }
}

This post crawls only a small amount of data from Chinaz (站长之家). When testing, it is best to add a thread sleep between requests so the crawler does not put too much load on the server.
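If you extend the crawl to more list pages, keep a pause between page fetches as well. A sketch, assuming later pages follow an index_2.html, index_3.html naming pattern; that pattern is an assumption, so verify it against the actual site before relying on it:

package DATA;

import java.util.List;
import java.util.Map;

public class ExecuteMultiPage {
    public static void main(String[] args) throws Exception {
        // Page 1 is index.html; later pages are assumed to be index_N.html
        for (int page = 1; page <= 3; page++) {
            String url = page == 1
                    ? "http://top.chinaz.com/all/index.html"
                    : "http://top.chinaz.com/all/index_" + page + ".html";
            String htmlBody = CrawlData.downloadHtml(url);
            if (htmlBody == null) {
                continue; // skip pages that failed to download
            }
            List<Map<String, Object>> dataList = Analysis.analysisData(htmlBody);
            for (Map<String, Object> data : dataList) {
                StoreData.adds(data);
            }
            // Pause between pages to keep the request rate low
            Thread.sleep(2000);
        }
    }
}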
