package demo;
import com.google.common.base.Joiner;
import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.MongoClient;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import redis.clients.jedis.Jedis;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import javax.swing.plaf.synth.SynthSpinnerUI;
/**
 * Single-threaded crawler that walks "following" lists on tw.weibo.com.
 *
 * <p>Seeds are read from the Redis list {@code follower:uid_connum}; each entry has the form
 * {@code "uid,followingCount"}. For every seed the crawler pages through the accounts the seed
 * follows, opens each followed profile, and treats a profile with more than one million followers
 * as a celebrity ("大V"). Newly found celebrities are pushed back onto the same Redis list as
 * further seeds, and the celebrity UIDs followed by the seed are saved to the MongoDB collection
 * {@code Belle.Following}.
 *
 * <p>An earlier in-memory variant seeded its queue with {@code "1782270602,286"} and
 * {@code "yiwenmu,575"}; with Redis the list must be populated before this program starts.
 *
 * <p>NOTE(review): nothing here remembers which UIDs were already crawled, so two celebrities
 * that follow each other will keep re-enqueueing one another. Confirm whether deduplication is
 * handled elsewhere.
 */
public class RedisSpider2 {
    /** Redis list holding pending seeds, each formatted as "uid,followingCount". */
    private static final String QUEUE_KEY = "follower:uid_connum";

    /** Number of followed accounts requested per listing page. */
    private static final int PAGE_SIZE = 32;

    /** A profile needs strictly more followers than this to count as a celebrity. */
    private static final long CELEBRITY_MIN_FOLLOWERS = 1000000L;

    /** Value sent as the "currentuid" request parameter on every request. */
    private static final String CURRENT_UID = "1991953413";

    /**
     * Cookie header of a logged-in Sina user, sent with every request.
     * NOTE(review): this is a hard-coded session credential; consider loading it from
     * configuration instead of keeping it in source control.
     */
    private static final String COOKIE = "SINAGLOBAL=9711952802808.682.1499668084041; __gads=ID=130480cbf75a5ad6:T=1500258878:S=ALNI_MZRcFnJveJoxtD4TBbJNbD__KcD-A; UM_distinctid=15d4e658b271f5-0573ee5fbac68c-1b1d7751-1fa400-15d4e658b28944; wvr=6; UOR=,,www.shejidaren.com; _ga=GA1.2.84290483.1500258858; _gid=GA1.2.952496907.1501988163; crtg_rta=; SSOLoginState=1502013392; SCF=Agl4g4wcoIanRFGqnVTCrYVRKcGcCSGCXik1olhn6h9oPJpHfvjbBZyqPuHU-ibuQXyHNclynoHBANDkf_WRtWc.; SUB=_2A250gpeCDeRhGedH4lMY9S3Nzz-IHXVX-Y5KrDV8PUNbmtBeLUWjkW9XZkSwKQAYTJkZTvK42M-ekE_Hnw..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWEbrh.i5bvbnSvvbQHi2M35JpX5KMhUgL.Fo241K24SKepShe2dJLoIpjLxKML1KBLBo5LxK-LB--LB-2LxK-L1h-L1h2t; SUHB=0LcjYQm8zbqhfQ; ALF=1533549388; _s_tentry=-; Apache=9574205823039.291.1502013327944; ULV=1502013327950:6:1:2:9574205823039.291.1502013327944:1501421673576";

    /** Shared MongoDB client, reused for every write instead of reconnecting each time. */
    private static final MongoClient mongo = new MongoClient("localhost", 27017);

    /** Shared Redis connection used for both dequeueing and enqueueing seeds. */
    private static final Jedis jedis = new Jedis("localhost");

    /**
     * Pops the next seed from the head of the Redis seed list; the entry is removed from Redis.
     *
     * @return the seed in {@code "uid,followingCount"} form, or {@code null} when the list is empty
     */
    public static String getSeedFromRedis() {
        String element = jedis.lpop(QUEUE_KEY);
        System.out.println("[DEBUG] : 现出队 -------------------- " + element);
        return element;
    }

    /**
     * Saves a document into the {@code Following} collection of the {@code Belle} database.
     * A document whose {@code _id} already exists is replaced.
     *
     * @param document the document to store
     */
    public static void writeDataIntoMongo(BasicDBObject document) {
        DB db = mongo.getDB("Belle");
        DBCollection table = db.getCollection("Following");
        table.save(document);
    }

    /**
     * Entry point: drains the Redis seed list, crawling each seed in turn, and stops when the
     * list is empty or the thread is interrupted.
     *
     * @param args unused
     * @throws IOException declared for compatibility with existing callers
     */
    public static void main(String[] args) throws IOException {
        System.out.println("成功读取种子文件");
        while (!Thread.currentThread().isInterrupted()) {
            String uid_fonum = getSeedFromRedis();
            if (uid_fonum == null) {
                // lpop yields null once the list is empty: nothing left to crawl.
                System.out.println("[DEBUG] seed queue is empty, stopping");
                break;
            }
            System.out.println("[DEBUG] 大V " + uid_fonum + " 已出队列");

            // A seed is "uid,followingCount"; skip anything that does not match that shape.
            String[] parts = uid_fonum.split(",");
            long following = parts.length == 2 ? parseCount(parts[1]) : -1;
            if (parts[0].isEmpty() || following < 0 || following > Integer.MAX_VALUE) {
                System.err.println("[WARN] skipping malformed seed: " + uid_fonum);
                continue;
            }

            System.out.println("[DEBUG]----开始爬取-----" + uid_fonum + "-------");
            crawler_and_toMongo(parts[0], (int) following);
        }
    }

    /**
     * Crawls every listing page of accounts followed by {@code uid}, enqueues the celebrities
     * found among them, and stores the list of their UIDs in MongoDB under {@code _id = uid}.
     *
     * @param uid       UID of the seed user whose following list is crawled
     * @param following number of accounts the seed follows; determines how many pages are requested
     */
    private static void crawler_and_toMongo(String uid, int following) {
        // One page lists PAGE_SIZE accounts; the "+ 3" keeps the original slack past the last page.
        double pageLimit = following / (double) PAGE_SIZE + 3;
        List<String> celebrities = new ArrayList<String>();

        for (int page = 1; page < pageLimit; page++) {
            System.out.println("The current uid is " + uid + " and the page number is " + page);

            Document listing;
            try {
                listing = fetchFollowPage(uid, page);
            } catch (IOException ex) {
                // Back off for a while before moving on to the next page.
                System.err.println("[WARN] follow page " + page + " of " + uid + " failed: " + ex);
                timeDelay(30, 35);
                continue;
            }

            for (Element entry : listing.getElementsByClass("fwBox")) {
                String profileUrl = entry.attr("href");
                System.out.println(profileUrl);
                if (profileUrl.isEmpty()) {
                    continue; // no link to follow
                }
                try {
                    inspectProfile(uid, profileUrl, celebrities);
                    saveFollowing(uid, celebrities);
                } catch (IOException ex) {
                    System.err.println("[WARN] profile " + profileUrl + " failed: " + ex);
                } catch (IllegalArgumentException ex) {
                    // jsoup rejects URLs it cannot parse; skip the entry rather than abort the crawl.
                    System.err.println("[WARN] unusable profile url " + profileUrl + ": " + ex);
                }
                // Pause between profile requests, whether or not the request succeeded.
                timeDelay(1, 3);
            }
        }
    }

    /** Requests one page of the accounts followed by {@code uid} from the follow API. */
    private static Document fetchFollowPage(String uid, int page) throws IOException {
        Connection con = Jsoup.connect("http://tw.weibo.com/api/user/follow");
        con.data("uid", uid);
        con.data("page", String.valueOf(page));
        con.data("currentuid", CURRENT_UID);
        con.data("page_size", String.valueOf(PAGE_SIZE));
        con.header("Cookie", COOKIE);
        con.header("Referer", "http://tw.weibo.com/" + uid + "/follow/p/" + page);
        return con.post();
    }

    /**
     * Fetches one followed profile; if it has more than {@link #CELEBRITY_MIN_FOLLOWERS}
     * followers, records its UID in {@code celebrities} and enqueues it as a new seed.
     */
    private static void inspectProfile(String uid, String profileUrl, List<String> celebrities)
            throws IOException {
        Connection con = Jsoup.connect(profileUrl);
        con.data("uid", uid);
        con.userAgent("Mozilla");
        con.data("currentuid", CURRENT_UID);
        con.data("page_size", String.valueOf(PAGE_SIZE));
        con.header("Cookie", COOKIE);
        con.header("Referer", profileUrl);
        Document profile = con.get();

        Elements info = profile.select("#mInfo").select("ul");
        for (Element fans : info.select("li.fansNum").select("a").select("strong")) {
            // A non-numeric count parses as negative and therefore never passes the threshold.
            if (parseCount(fans.text()) <= CELEBRITY_MIN_FOLLOWERS) {
                continue;
            }
            // The UID is the last path segment of the profile URL.
            String celebrityUid = profileUrl.substring(profileUrl.lastIndexOf("/") + 1);
            if (celebrityUid.isEmpty() || celebrities.contains(celebrityUid)) {
                continue; // nothing usable, or already recorded for this seed
            }
            celebrities.add(celebrityUid);

            long celebrityFollowing =
                    parseCount(info.select("li.followNum").select("a").select("strong").text());
            if (celebrityFollowing < 0 || celebrityFollowing > Integer.MAX_VALUE) {
                // Without a usable following count the entry could not be parsed back as a seed.
                System.err.println("[WARN] no following count for " + celebrityUid + ", not enqueued");
                continue;
            }
            // rpush appends at the tail; getSeedFromRedis pops from the head, giving FIFO order.
            jedis.rpush(QUEUE_KEY, celebrityUid + "," + celebrityFollowing);
            System.out.println("[DEBUG]" + celebrityUid + " 是大V且已经入队");
        }
    }

    /** Stores the celebrities followed by {@code uid} as one document keyed by the UID. */
    private static void saveFollowing(String uid, List<String> celebrities) {
        BasicDBObject document = new BasicDBObject();
        document.put("_id", uid);
        document.put("following_celebrant", celebrities);
        writeDataIntoMongo(document);
    }

    /**
     * Parses a scraped counter.
     *
     * @return the parsed value, or {@code -1} when {@code text} is null or not a plain integer
     */
    private static long parseCount(String text) {
        if (text == null) {
            return -1;
        }
        try {
            return Long.parseLong(text.trim());
        } catch (NumberFormatException ignored) {
            // Scraped text that is not a number is reported as "unknown" rather than failing.
            return -1;
        }
    }

    /**
     * Sleeps for a random duration between {@code min} and {@code max} seconds.
     * If the thread is interrupted while sleeping, the interrupt flag is restored and the
     * method returns early.
     *
     * @param min shortest delay in seconds
     * @param max longest delay in seconds
     */
    public static void timeDelay(float min, float max) {
        double seconds = min + (max - min) * Math.random();
        long millis = (long) (seconds * 1000);
        if (millis <= 0) {
            return; // Thread.sleep rejects negative durations
        }
        try {
            Thread.sleep(millis);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }
}
// Tag: web crawler
// (Blog page footer: latest recommended article published 2024-04-01 13:30:49)