jsoup爬虫,项目实战,欢迎收看

import com.mongodb.BasicDBObject
import com.mongodb.DBCollection
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import org.jsoup.select.Elements

public class ZhongYuan {
    public static final DBCollection test = MongoUtils.getCollectionByName("name", "table", 
"port")
    public static final DBCollection html = MongoUtils.getCollectionByName("name", "table", 
"port")

    public static void main(String[] args){
//        循环遍历页面进行数据爬去
        for(int i = 500 ; i<598 ;i++) {
            String url = "http://sh.centanet.com/xiaoqu/g"+i+"/";
            String result = RequestUtil.doGet(url, "GBK");
            Document doc = Jsoup.parse(result);
            //页面加载完成后对document进行处理,获取自己有用的数据
            parseList(doc);
            System.out.println("page=====>"+i);
        }
    }
    private static void parseList(Document doc){

        Elements elements = doc.select("div.house-listBox>div");
        int j = 0;
        for(Element element : elements){

            String name = element.select(".house-title a").first().text();
            html.save(new BasicDBObject("name",name).append("html",element.toString()))
            String regionstr = element.select("div>div>p").first().text().replace(' ','-');
            String region = regionstr.split("-")[0];
            String address = null;
            if(regionstr.split("-").length>1) {
                address = regionstr.split("-")[1] + regionstr.split("-")[2];
            } else {
                address = regionstr.split("-")[1];
            }

            String price = element.select("div>div").last().select("p").first().text();
            test.insert(new BasicDBObject("city","上海").append("region",region).append("name",name)
                    .append("avg_price",price));
            System.out.println(name);
            j++;
        }
        System.out.println(j);
    }
    private static void parseList1(Document doc) {
        Elements elements = doc.select("div.section>ul>li");
        String name = null;
        String region = null;
        String price = null;
        for (Element element : elements) {
            if (element.toString().contains("room-img")) {
                name = element.select("h5.room-name a").first().text();
                Elements datas = element.select("p");
                int i = 0;
                for (Element data : datas) {
                    i++;
                    if (i == 2) {
                        price = data.text();
                    }
                    if (i == 4) {
                        region = data.text();
                    }
                }
                System.out.println(name + price + region);
                test.insert(new BasicDBObject("city","上海").append("region",region).append("name",name)
                        .append("avg_price",price));
            }
        }
    }
}

上面代码中的 String result = RequestUtil.doGet(url, "GBK"); 调用的是我自己封装的 HTTP 工具类 RequestUtil,相关的 doGet 方法如下,可以参考;这里也可以改用 jsoup 自带的请求方式(例如 Jsoup.connect(url).get())来获取页面。

/**
     * Sends a GET request with no extra headers, the "UTF-8" charset, and without passing
     * the URL through {@code encodingUrl}.
     *
     * @param url the address to request
     * @return the result of {@link #doGet(String, Map, String, boolean)}
     */
    public static String doGet(String url) {
        final Map<String, String> noHeaders = null;
        return doGet(url, noHeaders, "UTF-8", false);
    }

    /**
     * Sends a GET request with no extra headers and the "UTF-8" charset, letting the caller
     * decide whether the URL is passed through {@code encodingUrl} first.
     *
     * @param url       the address to request
     * @param encodeUrl forwarded unchanged to the four-argument overload
     * @return the result of {@link #doGet(String, Map, String, boolean)}
     */
    public static String doGet(String url, boolean encodeUrl) {
        final Map<String, String> noHeaders = null;
        return doGet(url, noHeaders, "UTF-8", encodeUrl);
    }

    /**
     * Sends a GET request with no extra headers and a caller-chosen charset; the URL is
     * passed through {@code encodingUrl} before the request is made.
     *
     * @param url     the address to request
     * @param charset charset name forwarded unchanged to the four-argument overload
     * @return the result of {@link #doGet(String, Map, String, boolean)}
     */
    public static String doGet(String url, String charset) {
        final Map<String, String> noHeaders = null;
        return doGet(url, noHeaders, charset, true);
    }

    /**
     * Sends a GET request with the given headers and the "UTF-8" charset; the URL is passed
     * through {@code encodingUrl} before the request is made.
     *
     * @param url     the address to request
     * @param headers request headers forwarded unchanged to the four-argument overload
     * @return the result of {@link #doGet(String, Map, String, boolean)}
     */
    public static String doGet(String url, Map<String, String> headers) {
        final String defaultCharset = "UTF-8";
        return doGet(url, headers, defaultCharset, true);
    }

    /**
     * Executes an HTTP GET with a freshly built client and returns the response body.
     *
     * <p>The body is read only when the status code is exactly 200. For any other status, or
     * when an {@link IOException} occurs (its stack trace is printed), {@code null} is
     * returned. The response and the client are closed before the method returns.
     *
     * @param url       the address to request
     * @param headers   request headers to add, or {@code null} for none
     * @param charset   charset name handed to {@code encodingUrl} and to
     *                  {@code EntityUtils.toString} when reading the body
     * @param encodeUrl whether {@code url} is passed through {@code encodingUrl} first
     * @return the response body, or {@code null} if it was not obtained
     */
    public static String doGet(final String url, Map<String, String> headers, String charset, boolean encodeUrl) {
        final CloseableHttpClient httpClient = HttpClients.custom().setUserAgent(USERAGENT_CHROME).build();
        CloseableHttpResponse httpResponse = null;
        String body = null;
        try {
            final String target = encodeUrl ? encodingUrl(url, charset) : url;
            final HttpGet request = new HttpGet(target);
            // NOTE: no RequestConfig (socket/connect timeout) is applied to this request.
            if (headers != null) {
                for (Map.Entry<String, String> header : headers.entrySet()) {
                    request.addHeader(header.getKey(), header.getValue());
                }
            }
            httpResponse = httpClient.execute(request);
            final boolean ok = httpResponse.getStatusLine().getStatusCode() == 200;
            if (ok) {
                body = EntityUtils.toString(httpResponse.getEntity(), charset);
            }
        } catch (IOException e) {
            // ClientProtocolException is an IOException, so a single handler covers both.
            e.printStackTrace();
        } finally {
            if (httpResponse != null) {
                try {
                    httpResponse.close();
                } catch (IOException ignored) {
                    // Best-effort close; nothing useful can be done here.
                }
            }
            try {
                httpClient.close();
            } catch (IOException ignored) {
                // Best-effort close; nothing useful can be done here.
            }
        }
        return body;
    }

转载于:https://my.oschina.net/u/3844156/blog/1813590

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值