爬虫

package demo;

import com.google.common.base.Joiner;

import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.MongoClient;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import redis.clients.jedis.Jedis;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import javax.swing.plaf.synth.SynthSpinnerUI;

/**
 * Single-threaded crawler that walks Weibo "following" lists and records which of the followed
 * accounts are celebrities (more than one million followers).
 *
 * <p>Work items ("seeds") live in the Redis list {@code follower:uid_connum}. Each seed is a
 * string of the form {@code "uid,followingCount"}, for example {@code "1782270602,286"}. Seeds are
 * taken from the head of the list and newly discovered celebrities are appended to its tail, so
 * the list is processed first-in, first-out. For every crawled user a document
 * {@code {_id: uid, following_celebrant: [celebrity uids]}} is saved to the {@code Following}
 * collection of the {@code Belle} MongoDB database.
 *
 * <p>NOTE(review): a celebrity that is reachable from several users is enqueued once per
 * discovery; there is no cross-run "already crawled" set, so mutually-following celebrities will
 * be re-crawled. Confirm whether that is intended before adding de-duplication.
 */
public class RedisSpider2 {

    /** Redis list that holds the pending seeds. */
    private static final String SEED_QUEUE_KEY = "follower:uid_connum";

    /** Number of followed users requested per listing page. */
    private static final int PAGE_SIZE = 32;

    /**
     * Added to {@code following / PAGE_SIZE} to obtain the exclusive upper page bound. The value
     * is kept from the original code; it makes the crawler request a couple of pages past the
     * computed last page (presumably slack for a stale following count - not verified).
     */
    private static final int PAGE_BOUND_OFFSET = 3;

    /** An account with more followers than this is treated as a celebrity. */
    private static final long CELEBRITY_FOLLOWER_THRESHOLD = 1000000L;

    /** Value sent as the {@code currentuid} request parameter. */
    private static final String CURRENT_UID = "1991953413";

    /**
     * Cookie header of a logged-in Sina session, sent with every request.
     * NOTE(review): this is a hard-coded session credential; it should be supplied through
     * configuration instead of living in source control.
     */
    private static final String LOGIN_COOKIE = "SINAGLOBAL=9711952802808.682.1499668084041; __gads=ID=130480cbf75a5ad6:T=1500258878:S=ALNI_MZRcFnJveJoxtD4TBbJNbD__KcD-A; UM_distinctid=15d4e658b271f5-0573ee5fbac68c-1b1d7751-1fa400-15d4e658b28944; wvr=6; UOR=,,www.shejidaren.com; _ga=GA1.2.84290483.1500258858; _gid=GA1.2.952496907.1501988163; crtg_rta=; SSOLoginState=1502013392; SCF=Agl4g4wcoIanRFGqnVTCrYVRKcGcCSGCXik1olhn6h9oPJpHfvjbBZyqPuHU-ibuQXyHNclynoHBANDkf_WRtWc.; SUB=_2A250gpeCDeRhGedH4lMY9S3Nzz-IHXVX-Y5KrDV8PUNbmtBeLUWjkW9XZkSwKQAYTJkZTvK42M-ekE_Hnw..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWEbrh.i5bvbnSvvbQHi2M35JpX5KMhUgL.Fo241K24SKepShe2dJLoIpjLxKML1KBLBo5LxK-LB--LB-2LxK-L1h-L1h2t; SUHB=0LcjYQm8zbqhfQ; ALF=1533549388; _s_tentry=-; Apache=9574205823039.291.1502013327944; ULV=1502013327950:6:1:2:9574205823039.291.1502013327944:1501421673576";

    /** Shared MongoDB client, reused by every write instead of reconnecting per document. */
    private static final MongoClient mongo = new MongoClient("localhost", 27017);

    /** Shared Redis connection used for the seed queue. */
    private static final Jedis jedis = new Jedis("localhost");

    /**
     * Removes and returns the seed at the head of the Redis seed list.
     *
     * @return the seed string ({@code "uid,followingCount"}), or {@code null} when the list is
     *         empty
     */
    public static String getSeedFromRedis() {
        // lpop removes the element, so a seed is handed out exactly once.
        String element = jedis.lpop(SEED_QUEUE_KEY);
        System.out.println("[DEBUG] : 现出队 -------------------- " + element);
        return element;
    }

    /**
     * Saves one document into the {@code Following} collection of the {@code Belle} database.
     *
     * @param document the document to store; callers in this class always set {@code _id}, so a
     *                 repeated save for the same user replaces the earlier document
     */
    public static void writeDataIntoMongo(BasicDBObject document) {
        // Reuse the shared client: opening a new MongoClient per call leaked a connection pool
        // for every saved document.
        DB db = mongo.getDB("Belle");
        DBCollection table = db.getCollection("Following");
        table.save(document);
    }

    /**
     * Drains the Redis seed list, crawling each seed user in turn, and stops when the list is
     * empty or the thread is interrupted. Malformed seeds are reported and skipped.
     *
     * @param args unused
     * @throws IOException declared for interface compatibility; network errors are handled
     *                     inside the crawl itself
     */
    public static void main(String[] args) throws IOException {
        System.out.println("成功读取种子文件");

        try {
            while (!Thread.currentThread().isInterrupted()) {
                String uid_fonum = getSeedFromRedis();
                if (uid_fonum == null) {
                    // Nothing left to crawl; without this check split() would throw a
                    // NullPointerException.
                    System.out.println("[DEBUG] seed queue " + SEED_QUEUE_KEY + " is empty, stopping");
                    break;
                }
                System.out.println("[DEBUG] 大V " + uid_fonum + " 已出队列");

                // A seed is "uid,followingCount".
                String[] parts = uid_fonum.split(",");
                if (parts.length < 2 || parts[0].trim().isEmpty()) {
                    System.out.println("[WARN] skipping malformed seed: " + uid_fonum);
                    continue;
                }
                String uid = parts[0].trim();
                int following;
                try {
                    following = Integer.parseInt(parts[1].trim());
                } catch (NumberFormatException e) {
                    System.out.println("[WARN] skipping seed with non-numeric following count: " + uid_fonum);
                    continue;
                }

                System.out.println("[DEBUG]----开始爬取-----" + uid_fonum + "-------");
                crawler_and_toMongo(uid, following);
            }
        } finally {
            jedis.close();
            mongo.close();
        }
    }

    /**
     * Crawls every "following" listing page of {@code uid}, visits the profile of each followed
     * account, and records the accounts that qualify as celebrities. Each newly found celebrity
     * is also appended to the Redis seed list so that it is crawled later.
     *
     * @param uid       the user whose following list is crawled
     * @param following the number of accounts {@code uid} follows; determines how many pages are
     *                  requested
     */
    private static void crawler_and_toMongo(String uid, int following) {
        // Exclusive upper bound for the page index (same formula as before, without the lossy
        // float cast).
        double pageBound = following / (double) PAGE_SIZE + PAGE_BOUND_OFFSET;

        List<String> certain_celebrate_list = new ArrayList<String>();

        for (int page = 1; page < pageBound; page++) {
            if (Thread.currentThread().isInterrupted()) {
                // timeDelay no longer sleeps once interrupted, so stop instead of hammering
                // the site without pauses.
                return;
            }
            System.out.println("The current uid is " + uid + " and the page number is " + page);
            String following_url = "http://tw.weibo.com/" + uid + "/follow/p/" + page;

            Document listing;
            try {
                listing = fetchFollowPage(uid, page, following_url);
            } catch (IOException e) {
                // Back off for a while before moving on to the next page.
                System.out.println("[WARN] failed to load " + following_url + ": " + e);
                timeDelay(30, 35);
                continue;
            }

            for (Element link : listing.getElementsByClass("fwBox")) {
                // absUrl resolves relative links and yields "" when there is no usable href;
                // Jsoup.connect would throw on an empty or relative URL.
                String following_url_redirect = link.absUrl("href");
                if (following_url_redirect.isEmpty()) {
                    continue;
                }
                System.out.println(following_url_redirect);

                try {
                    Document profile = fetchProfile(uid, following_url_redirect);
                    recordIfCelebrity(profile, following_url_redirect, certain_celebrate_list);

                    BasicDBObject document = new BasicDBObject();
                    document.put("_id", uid);
                    document.put("following_celebrant", certain_celebrate_list);
                    // Saved after every visited profile so progress survives a crash mid-crawl.
                    writeDataIntoMongo(document);

                    timeDelay(1, 3);
                } catch (IOException e) {
                    System.out.println("[WARN] failed to load " + following_url_redirect + ": " + e);
                }
            }
        }
    }

    /** Posts to the follow-list API and returns the parsed listing page for {@code uid}. */
    private static Document fetchFollowPage(String uid, int page, String referer) throws IOException {
        Connection con = Jsoup.connect("http://tw.weibo.com/api/user/follow");
        con.data("uid", uid);
        con.data("page", String.valueOf(page));
        con.data("currentuid", CURRENT_UID);
        con.data("page_size", String.valueOf(PAGE_SIZE));
        con.header("Cookie", LOGIN_COOKIE);
        con.header("Referer", referer);
        return con.post();
    }

    /** Fetches the profile page of one followed account. */
    private static Document fetchProfile(String uid, String profileUrl) throws IOException {
        Connection con = Jsoup.connect(profileUrl);
        con.data("uid", uid);
        con.userAgent("Mozilla");
        con.data("currentuid", CURRENT_UID);
        con.data("page_size", String.valueOf(PAGE_SIZE));
        con.header("Cookie", LOGIN_COOKIE);
        con.header("Referer", profileUrl);
        return con.get();
    }

    /**
     * Adds the profile's owner to {@code celebrities} and to the Redis seed list when the
     * follower count shown on the profile exceeds the celebrity threshold.
     */
    private static void recordIfCelebrity(Document profile, String profileUrl, List<String> celebrities) {
        Elements followerCounts =
                profile.select("#mInfo").select("ul").select("li.fansNum").select("a").select("strong");
        for (Element countElement : followerCounts) {
            String follower_num = countElement.text();

            long followers;
            try {
                followers = Long.parseLong(follower_num.trim());
            } catch (NumberFormatException e) {
                // The page showed something other than a plain number; not classifiable.
                System.out.println("[WARN] unreadable follower count '" + follower_num + "' at " + profileUrl);
                continue;
            }
            if (followers <= CELEBRITY_FOLLOWER_THRESHOLD) {
                continue;
            }

            // The uid is the last path segment of the profile URL.
            String following_celebrant_uid = profileUrl.substring(profileUrl.lastIndexOf("/") + 1);
            if (following_celebrant_uid.isEmpty() || celebrities.contains(following_celebrant_uid)) {
                continue;
            }
            celebrities.add(following_celebrant_uid);

            Elements followCount =
                    profile.select("#mInfo").select("ul").select("li.followNum").select("a").select("strong");
            String celebrant_following = followCount.text().trim();
            try {
                // main() parses this field as an int, so only enqueue seeds it can read.
                Integer.parseInt(celebrant_following);
            } catch (NumberFormatException e) {
                System.out.println("[WARN] " + following_celebrant_uid
                        + " has unreadable following count '" + celebrant_following + "', not enqueued");
                continue;
            }

            String new_uid_fonum = following_celebrant_uid + "," + celebrant_following;
            jedis.rpush(SEED_QUEUE_KEY, new_uid_fonum);
            System.out.println("[DEBUG]" + following_celebrant_uid + "   是大V且已经入队");
        }
    }

    /**
     * Sleeps for a random whole number of seconds between {@code min} and {@code max}, both
     * inclusive. Fractional bounds are truncated, negative bounds are treated as zero, and the
     * bounds are swapped if given in the wrong order. If the thread is interrupted the sleep ends
     * early and the interrupt flag is left set for the caller.
     *
     * @param min shortest delay in seconds
     * @param max longest delay in seconds
     */
    public static void timeDelay(float min, float max) {
        long low = Math.max(0L, (long) Math.min(min, max));
        long high = Math.max(low, (long) Math.max(min, max));
        // Math.random() is < 1, so the offset is at most (high - low).
        long seconds = low + (long) (Math.random() * (high - low + 1));
        try {
            Thread.sleep(seconds * 1000L);
        } catch (InterruptedException e) {
            // Restore the flag so the crawl loops can notice the interruption and stop.
            Thread.currentThread().interrupt();
        }
    }
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值