用Java程序爬取网页中的地址和电话

1.test.java

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.HashMap;

import java.util.Map;

/**
 * Scrapes the address and phone number of a fixed list of shops from
 * {@code https://m.dianping.com/shop/<shopId>} and prints one SQL
 * {@code UPDATE} statement per shop to standard output.
 *
 * <p>Requests are issued sequentially with a pause between them, and the
 * User-Agent header is rotated. The notes accompanying this code state that
 * the site limits the number of visits, which is why several User-Agent
 * strings are configured.
 */
public class test {

    /** Timeout handed to the jsoup connection, in milliseconds. */
    private static final int REQUEST_TIMEOUT_MS = 5000;

    /** Pause after every request, in milliseconds. */
    private static final long REQUEST_INTERVAL_MS = 5000L;

    /** Number of consecutive requests sent with the same User-Agent. */
    private static final int REQUESTS_PER_USER_AGENT = 10;

    /** Prefix of the shop page URL; the shop id is appended to it. */
    private static final String SHOP_URL_PREFIX = "https://m.dianping.com/shop/";

    /** Ids of the shops to scrape, visited in this order. */
    private static final int[] SHOP_IDS = {
            50312563, 92876536, 22097321, 41051116, 13113097,
            68892758, 73445642, 13111733, 73477278, 93793067,
            93689649, 80808017, 62279433, 73488605, 50342122,
            92227067, 44163423, 79249736, 94046038, 32628063,
            13113771, 68277921, 13110873, 13113505, 13111764,
            48297440, 73484839, 41459573, 69358429, 73443708,
            71249905, 10538942, 13111070, 13112312, 41259426,
            62996680, 48404120, 44142507, 43776712, 45352099,
            70075166, 79107710, 73450151, 16671845, 13111241,
            13110710, 70159730, 93434655, 65823268, 73494608,
            97173829, 43760043, 78998150, 70411653, 97048620,
            23542698, 93299200, 62380363, 13112989, 92525324,
            9592171, 92887084, 28616571, 67655539, 16713892,
            13119853, 21023053, 13120114, 19451694, 13120361,
            19817673, 17724275, 13120349, 70197364, 44017351,
            62912981, 77708553, 77863895, 13120201, 24640426,
            16322224, 13120008, 62984143, 44138422, 62842193,
            13120213, 13120380, 13119913, 14630595, 62370768,
            13120249, 69886952, 13120122, 14630594, 13119831,
            77936903, 13120129, 66565778, 77856662, 58929641,
            80709972, 62997305, 67114391, 77675166, 79038201,
            80661103, 13120464, 16673316, 92922849, 77934891,
            77611980, 13120265, 70355164, 38204276, 13120007,
            21020766, 13120024, 63619708, 63504700, 78008237,
            13120261, 13120391, 14630470, 15576737, 69474183,
            16473023, 90075588, 77825833, 62902568, 43757568,
            13120151, 44274667, 90081825, 44269009, 77977776,
            93217968, 43914698, 92927515, 71723659, 76875545,
            40245158, 97074853, 97075050, 97074960, 96688405
    };

    /** User-Agent strings that are rotated across requests. */
    private static final String[] USER_AGENTS = {
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; QQDownload 734; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; GTB7.5; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; QQBrowser/7.4.14016.400)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; GTB7.5; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; GTB7.4; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; GTB7.4; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; InfoPath.3; KB974488)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; chromeframe/28.0.1500.72; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; Tablet PC 2.0; QQBrowser/7.4.14018.400)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; WebSaver; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET4.0C; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; InfoPath.2)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SV1; QQBrowser/7.4.14018.400)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SV1; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SV1; InfoPath.3)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SV1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SV1; .NET CLR 2.0.50727; InfoPath.3; QQBrowser/7.3.11251.400)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SV1; .NET CLR 2.0.50727; .NET4.0C; .NET4.0E; InfoPath.2)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SV1; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SV1;  Embedded Web Browser from: http://bsalsa.com/; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SV1)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SMT Player 3.9.1.430; .NET CLR 2.0.50727; LBBROWSER)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; QQDownload 718; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; QQDownload 677; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; QQBrowser/7.4.14018.400)"
    };

    /**
     * Visits every shop in {@link #SHOP_IDS} once, pausing
     * {@link #REQUEST_INTERVAL_MS} after each request.
     *
     * <p>If the thread is interrupted while pausing, the interrupt flag is
     * restored and the remaining shops are skipped.
     *
     * @param args ignored
     * @throws Exception declared for backward compatibility
     */
    public static void main(String[] args) throws Exception {
        for (int i = 0; i < SHOP_IDS.length; i++) {
            int shopId = SHOP_IDS[i];
            // Switch to the next User-Agent every REQUESTS_PER_USER_AGENT requests;
            // the modulo wraps around so a longer shop list cannot run off the array.
            String ua = USER_AGENTS[(i / REQUESTS_PER_USER_AGENT) % USER_AGENTS.length];
            getHtml(SHOP_URL_PREFIX + shopId, shopId, ua);
            try {
                Thread.sleep(REQUEST_INTERVAL_MS);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                System.err.println("Interrupted after shopId=" + shopId + "; remaining shops skipped");
                return;
            }
        }
    }

    /**
     * Downloads one shop page and prints the SQL statement derived from it.
     *
     * <p>Failures are best-effort: any exception raised while fetching or
     * parsing is reported on standard error and the method returns normally,
     * so one bad page does not stop the whole run.
     *
     * @param url    address of the shop page
     * @param shopId id of the shop, used in the printed SQL statement
     * @param ua     value sent as the User-Agent header
     * @throws Exception declared for backward compatibility; fetch and parse
     *                   exceptions are caught and logged inside this method
     */
    public static void getHtml(String url, int shopId, String ua) throws Exception {
        Connection conInfo = Jsoup.connect(url);
        conInfo.timeout(REQUEST_TIMEOUT_MS);
        conInfo.userAgent(ua);

        try {
            Document doc = conInfo.get();
            parseCommentInfo(doc, shopId);
        } catch (Exception e) {
            // Boundary of a best-effort batch job: report and move on to the next shop.
            System.err.println("shopId=" + shopId + ": failed to fetch or parse " + url + ": " + e);
        }
    }

    /**
     * Extracts address and phone number from a shop page and prints an SQL
     * {@code UPDATE} statement for table {@code s_tainment}.
     *
     * <p>Three groups of selectors are tried in order; a later group that
     * matches overrides what an earlier one found.
     *
     * @param doc    parsed shop page
     * @param shopId id of the shop, used in the {@code WHERE} clause
     */
    private static void parseCommentInfo(Document doc, int shopId) {
        String address = null;
        String zxphone = null;

        // Variant 1: address and phone live in ".J_address" / ".J_phone" item lists.
        String listAddress = doc.select(".J_address .info-list.link-list .item").text();
        if (!listAddress.isEmpty()) {
            address = listAddress;
            zxphone = doc.select(".J_phone .info-list.link-list .item ").text();
        }

        // Variant 2: ".info-details .add" holds the address in its first link and,
        // presumably, a "tel:"-style phone link as its second link.
        for (Element details : doc.select(".info-details .add")) {
            address = details.select("a:nth-child(1)").text();
            // Look inside the matched element first; fall back to the document-wide
            // selector the previous version used when the element has no such link.
            String href = details.select("a:nth-child(2)").attr("href");
            if (href.isEmpty()) {
                href = doc.select("a:nth-child(2)").attr("href");
            }
            String phone = extractPhone(href);
            if (phone != null) {
                zxphone = phone;
            }
        }

        // Variant 3: shopping-mall header widget; only an address is read from it.
        for (Element header : doc.select("#lego-widget-shopping-mall-header-free-000-000")) {
            address = header.select(".mall-location span").text();
        }

        System.out.println(" update s_tainment set address=" + sqlLiteral(address)
                + ",zxphone=" + sqlLiteral(zxphone) + " where shopId=" + shopId + ";");
    }

    /**
     * Returns the part of a link target that follows its scheme separator,
     * e.g. {@code "tel:123"} yields {@code "123"}.
     *
     * @param href link target, possibly empty
     * @return the text after the first {@code ':'}, the trimmed input when it
     *         contains no {@code ':'}, or {@code null} when nothing usable remains
     */
    private static String extractPhone(String href) {
        if (href == null) {
            return null;
        }
        int colon = href.indexOf(':');
        // colon + 1 drops the separator itself; with no colon (-1) this keeps the whole string.
        String phone = href.substring(colon + 1).trim();
        return phone.isEmpty() ? null : phone;
    }

    /**
     * Renders a scraped value as an SQL string literal.
     *
     * <p>Single quotes are doubled so page content cannot terminate the
     * literal early. NOTE(review): databases that treat backslash as an escape
     * character inside string literals need additional escaping; prefer a
     * parameterized statement if these updates are ever executed directly.
     *
     * @param value scraped text, or {@code null} when nothing was found
     * @return {@code NULL} for a missing value, otherwise the quoted literal
     */
    private static String sqlLiteral(String value) {
        if (value == null) {
            return "NULL";
        }
        return "'" + value.replace("'", "''") + "'";
    }

}

2.pom.xml

 <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.8.3</version>
        </dependency>

3.注意点

先在pom.xml中导入所依赖的jsoup jar包,然后执行test.java的主函数,即可输出从指定网站爬取到的地址和电话信息;

另外,这里之所以设置这么多UA(User-Agent),是因为网站会限制访问次数;程序每请求10次就换用下一个UA。

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值