java简单网络爬虫

最近项目中有用到爬虫技术,所以来了解一下。找个话题练练手:我是一名租客,就想着把租房的信息抓取下来参考一下,哈哈哈

爬虫地址:58同城租房地址

http://zz.58.com/chuzu/?PGTID=0d3090a7-0015-63c6-e921-e0d02c8e6792&ClickID=2

我们想要的信息无非是:租房的户型,大小,详细地址,以及价格等(如下图)

关于解析html,我用的是jsoup,例子中选择器写得比较繁琐(不要在意哈~)。如何选择标签?按F12看一下你要找的内容所在的标签有什么特殊的class、id或是其他属性。我要找的数据所在的li标签上有sortid属性,因此我的选择器是li[sortid],这样可以直接拿到一条数据;再去看要找的具体属性是在哪个标签里,用同样的方法找到具体标签并拿到结果

找到图中标记的几个地方,获取对应的标签,再获取标签里的内容。下面上我自己的util类。

package org.myself.utils;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;
import org.myself.object.RoomInfo;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Created by zhao_xinhu
 * On 2018/8/27
 * Simple web-crawler utility class: downloads rental listing pages and
 * extracts one {@link RoomInfo} per listing with jsoup.
 */
public class WebSpiderUtils {
    private static final Logger logger = LoggerFactory.getLogger(WebSpiderUtils.class);

    /** Base address of the paged listing; page N (N > 1) lives under {@code pnN/}. */
    private static final String LIST_BASE_URL = "http://zz.58.com/chuzu/";

    /** Query string appended to every listing page request. */
    private static final String LIST_QUERY = "?PGTID=0d3090a7-0015-6954-674b-2be0becb5cfb&ClickID=2";

    /** Connect and read timeout for a single page request, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 5000;

    /**
     * Downloads the HTML source of the given URL.
     *
     * @param parentUrl absolute HTTP URL of the page to fetch
     * @return the page source with lines joined by {@code '\n'}, or an empty
     *         string when the page could not be read (the failure is logged)
     */
    private static String getHtmlText(String parentUrl){
        HttpURLConnection connection = null;
        StringBuilder html = new StringBuilder();
        try{
            connection = (HttpURLConnection) new URL(parentUrl).openConnection();
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);
            connection.setDoInput(true);
            connection.connect();
            // Decode explicitly instead of relying on the platform default charset.
            // NOTE(review): assumes the target pages are served as UTF-8 - confirm.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null){
                    // Keep the line break so text on adjacent lines is not glued together.
                    html.append(line).append('\n');
                }
            }
        }catch (IOException e){
            logger.error("Failed to read page {}", parentUrl, e);
            // Do not hand a half-downloaded page to the parser.
            return "";
        }finally {
            // The connection is still null when the URL itself was malformed.
            if (connection != null){
                connection.disconnect();
            }
        }
        return html.toString();
    }

    /**
     * Fetches a page and parses it into a jsoup document.
     *
     * @param url absolute HTTP URL of the page
     * @return the parsed document; an empty document when the download failed
     */
    private static Document getDocument(String url){
        return Jsoup.parse(getHtmlText(url));
    }

    /**
     * Crawls every page of the rental listing and collects the room details.
     *
     * @param url address of the first listing page; it is only used to read
     *            the total page count, the individual pages are requested
     *            from the fixed listing base address
     * @return a map from {@code "room" + pageNumber} to the rooms found on
     *         that page (an empty list when the page could not be parsed)
     */
    public static Map<String,List<RoomInfo>> printTarget(String url){
        // Read the total number of pages first, then crawl page by page.
        int maxPage = readMaxPage(getDocument(url));
        logger.info("一共{}页",maxPage);
        Map<String,List<RoomInfo>> result = new HashMap<>();
        for(int i = 1;i <= maxPage;i++){
            result.put("room" + i, parseRooms(getDocument(buildPageUrl(i))));
        }
        return result;
    }

    /**
     * Builds the URL of one listing page from scratch, so that page
     * suffixes and query strings never pile up from one page to the next.
     */
    private static String buildPageUrl(int page){
        StringBuilder pageUrl = new StringBuilder(LIST_BASE_URL);
        if(page > 1){
            pageUrl.append("pn").append(page).append('/');
        }
        return pageUrl.append(LIST_QUERY).toString();
    }

    /**
     * Reads the page count from the pager: the number sits in the span of
     * the link just before the last pager link. Falls back to a single page
     * when no usable pager is present.
     */
    private static int readMaxPage(Document doc){
        Element lastLink = doc.select("div[class=\"pager\"] > a[href]").last();
        Element lastPageLink = lastLink == null ? null : lastLink.previousElementSibling();
        Element pageNumber = first(lastPageLink, "span");
        if(pageNumber == null){
            logger.warn("No pager found, assuming a single page");
            return 1;
        }
        try{
            return Integer.parseInt(pageNumber.text().trim());
        }catch (NumberFormatException e){
            logger.warn("Unreadable page count '{}', assuming a single page", pageNumber.text());
            return 1;
        }
    }

    /** Extracts every listing of one page; malformed listings are skipped. */
    private static List<RoomInfo> parseRooms(Document document){
        List<RoomInfo> roomInfos = new ArrayList<RoomInfo>();
        // The listings are the li[sortid] items of the first ul.listUl.
        Element roomList = first(document, "ul[class=\"listUl\"]");
        if(roomList == null){
            logger.warn("No room list found on page");
            return roomInfos;
        }
        for(Element element : roomList.select("li[sortid]")){
            RoomInfo roomInfo = parseRoom(element);
            if(roomInfo != null){
                roomInfos.add(roomInfo);
            }
        }
        return roomInfos;
    }

    /**
     * Converts one listing item into a {@link RoomInfo}.
     *
     * @return the room, or {@code null} when the item lacks an expected tag
     *         or its price is not a whole number
     */
    private static RoomInfo parseRoom(Element element){
        Element detailLink = first(first(first(element, "div[class=\"des\"]"), "h2"), "a");
        Element room = first(element, "p[class=\"room\"]");
        Element address = first(element, "p[class=\"add\"]");
        Element price = first(first(first(element, "div[class=\"listliright\"]"), "div[class=\"money\"]"), "b");
        Elements addressInfos = address == null ? null : address.select("a");
        if(detailLink == null || room == null || price == null
                || addressInfos == null || addressInfos.isEmpty()){
            logger.warn("Skipping listing with unexpected layout");
            return null;
        }

        // The first link is the district; the second, when present, is the exact location.
        String addressText = addressInfos.get(0).text();
        String realAddress = "无确切位置";
        if(addressInfos.size() > 1){
            realAddress = addressInfos.get(1).text();
        }

        Integer realMoney;
        try{
            realMoney = Integer.valueOf(price.text().trim());
        }catch (NumberFormatException e){
            logger.warn("Skipping listing with non-numeric price '{}'", price.text());
            return null;
        }
        // The href is stored without a scheme, so the scheme is prepended here.
        return new RoomInfo(room.text(),addressText,realAddress,realMoney,"http:" + detailLink.attr("href"));
    }

    /** Null-safe lookup of the first element under {@code parent} matching the selector. */
    private static Element first(Element parent, String cssQuery){
        return parent == null ? null : parent.select(cssQuery).first();
    }

}

我的房屋对象,放的就是简单的户型、地址、详细地址、价格、还有该房屋的详情url。

展开阅读全文

没有更多推荐了,返回首页