废话板块
做大数据怎么没有数据呢?对于我们,数据的来源便是爬虫。其实博主之前自己基于HTTP协议写过一个小的爬虫。所以更加明白要处理去重,解析页面。解决各种各样的小麻烦,和触发js,跳过防爬虫机制是有多么的令人闹心。所幸。有一群无私的人创造了WebCollector Java 爬虫,将这些问题的大部分全部解决。并且十分的利于二次开发。十分感谢他们的付出。这是他们的网站:WebCollector教程。
废话板块二
爬虫简介:
WebCollector是一个无须配置、便于二次开发的JAVA爬虫框架(内核),它提供精简的API,只需少量代码即可实现一个功能强大的爬虫。WebCollector-Hadoop是WebCollector的Hadoop版本,支持分布式爬取。爬虫内核:
WebCollector致力于维护一个稳定、可扩展的爬虫内核,便于开发者进行灵活的二次开发。内核具有很强的扩展性,用户可以在内核基础上开发自己想要的爬虫。源码中集成了Jsoup,可进行精准的网页解析。2.x版本中集成了selenium,可以处理javascript生成的数据。
如何搭建WebCollector,在他们的网站上很详细,并且有例子可循。在此便不赘述。
这里我直接贴上我的微博爬虫。注释写得很清楚,也就不解释太多(的确太累了!!T.T)
直接看到这里复制过去吧
使用selenium登陆微博获取cookie
package com.codsway.crawler;
import cn.edu.hfut.dmic.webcollector.net.HttpRequest;
import cn.edu.hfut.dmic.webcollector.net.HttpResponse;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.util.Set;
import javax.imageio.ImageIO;
import java.awt.BorderLayout;
import java.awt.Container;
import java.awt.Dimension;
import java.awt.Graphics;
import java.awt.Toolkit;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.awt.image.BufferedImage;
import javax.swing.JButton;
import javax.swing.JFrame;
import javax.swing.JPanel;
import javax.swing.JTextField;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.htmlunit.HtmlUnitDriver;
/**
* 使用Selenium获取登录新浪微博weibo.cn的cookie
* 由于weibo.com是密文,所以只能读weibo.cn的
* @author wrm
*
*/
/**
 * Obtains a login session cookie for weibo.cn via Selenium's HtmlUnitDriver.
 * Only works against weibo.cn: it submits the plain-text login form. weibo.com
 * encrypts credentials client-side, so this approach does not apply there.
 */
public class WeiboCN {

    /**
     * Logs into weibo.cn and returns the session cookie string.
     * weibo.cn transmits credentials in clear text — use a throwaway account.
     *
     * @param username weibo.cn account name
     * @param password account password
     * @return concatenated "name=value;" cookie string containing gsid_CTandWM
     * @throws Exception if login fails (no session cookie was set) or any
     *                    network / image-decoding step goes wrong
     */
    public static String getSinaCookie(String username, String password) throws Exception {
        HtmlUnitDriver driver = new HtmlUnitDriver();
        try {
            driver.setJavascriptEnabled(true);
            driver.get("http://login.weibo.cn/login/");
            // The captcha is the only <img> on the login page.
            WebElement ele = driver.findElementByCssSelector("img");
            String src = ele.getAttribute("src");
            // Fetch the captcha image with the same session cookies so the
            // server associates it with this login attempt.
            String cookie = concatCookie(driver);
            HttpRequest request = new HttpRequest(src);
            request.setCookie(cookie);
            HttpResponse response = request.getResponse();
            BufferedImage img;
            try (ByteArrayInputStream is = new ByteArrayInputStream(response.getContent())) {
                img = ImageIO.read(is);
            }
            ImageIO.write(img, "png", new File("result.png"));
            // Blocks until the user types the captcha into the Swing dialog.
            String userInput = new CaptchaFrame(img).getUserInput();
            driver.findElementByCssSelector("input[name=mobile]").sendKeys(username);
            driver.findElementByCssSelector("input[type=password]").sendKeys(password);
            driver.findElementByCssSelector("input[name=code]").sendKeys(userInput);
            driver.findElementByCssSelector("input[name=remember]").click();
            driver.findElementByCssSelector("input[name=submit]").click();
            String result = concatCookie(driver);
            // gsid_CTandWM is the session cookie weibo.cn sets on success.
            if (result.contains("gsid_CTandWM")) {
                return result;
            }
            throw new Exception("weibo login failed");
        } finally {
            // Always release the browser session (previously leaked on failure).
            driver.close();
        }
    }

    /**
     * Flattens the driver's current cookie jar into a single
     * "name=value;name=value;" header string.
     */
    public static String concatCookie(HtmlUnitDriver driver) {
        Set<Cookie> cookieSet = driver.manage().getCookies();
        StringBuilder sb = new StringBuilder();
        for (Cookie cookie : cookieSet) {
            sb.append(cookie.getName() + "=" + cookie.getValue() + ";");
        }
        String result = sb.toString();
        return result;
    }

    /**
     * Minimal Swing dialog that shows the captcha image (scaled 2x) and blocks
     * the calling thread until the user submits the text they read from it.
     */
    public static class CaptchaFrame {
        JFrame frame;
        JPanel panel;
        JTextField input;
        // Pixel width reserved for the text field and for the button.
        int inputWidth = 100;
        BufferedImage img;
        // Filled in by the button's action listener on the EDT.
        String userInput = null;

        public CaptchaFrame(BufferedImage img) {
            this.img = img;
        }

        /**
         * Shows the dialog centered on screen and waits (wait/notify) for the
         * user to press the submit button.
         * NOTE(review): if the user closes the window instead of pressing the
         * button, the calling thread waits forever — confirm acceptable.
         *
         * @return the trimmed captcha text the user entered
         */
        public String getUserInput() {
            frame = new JFrame("输入验证码");
            final int imgWidth = img.getWidth();
            final int imgHeight = img.getHeight();
            // Image drawn at 2x, plus room for the text field and the button.
            int width = imgWidth * 2 + inputWidth * 2;
            int height = imgHeight * 2 + 50;
            Dimension dim = Toolkit.getDefaultToolkit().getScreenSize();
            int startx = (dim.width - width) / 2;
            int starty = (dim.height - height) / 2;
            frame.setBounds(startx, starty, width, height);
            Container container = frame.getContentPane();
            container.setLayout(new BorderLayout());
            panel = new JPanel() {
                @Override
                public void paintComponent(Graphics g) {
                    super.paintComponent(g);
                    // Scale the captcha 2x for readability.
                    g.drawImage(img, 0, 0, imgWidth * 2, imgHeight * 2, null);
                }
            };
            panel.setLayout(null);
            container.add(panel);
            input = new JTextField(6);
            input.setBounds(imgWidth * 2, 0, inputWidth, imgHeight * 2);
            panel.add(input);
            JButton btn = new JButton("登录");
            btn.addActionListener(new ActionListener() {
                @Override
                public void actionPerformed(ActionEvent e) {
                    userInput = input.getText().trim();
                    // Wake the thread blocked in getUserInput().
                    synchronized (CaptchaFrame.this) {
                        CaptchaFrame.this.notify();
                    }
                }
            });
            btn.setBounds(imgWidth * 2 + inputWidth, 0, inputWidth, imgHeight * 2);
            panel.add(btn);
            frame.setVisible(true);
            synchronized (this) {
                try {
                    this.wait();
                } catch (InterruptedException ex) {
                    // Re-assert the interrupt so callers can observe it.
                    Thread.currentThread().interrupt();
                    ex.printStackTrace();
                }
            }
            frame.dispose();
            return userInput;
        }
    }
}
微博内容爬取
/**
 * Visits one fetched page: on a paginated timeline it scrapes the weibo posts,
 * and on every matched page it queues further weibo.cn user links for crawling.
 * Matched URL shapes:
 *   http://weibo.cn/u/XXXXX        — user profile page (link discovery only)
 *   http://weibo.cn/...page=N      — one page of a user's timeline
 */
@Override
public void visit(Page page, CrawlDatums next) {
    if (page.matchUrl("http://weibo.cn/u?/?[^/,\\?]*")) {
        // Profile page: nothing to scrape here, just discover more users.
        enqueueUserLinks(page, next);
    } else if (page.matchUrl("http://weibo.cn/u?/?[^?]*\\??page=[0-9]*$")) {
        // Paged timeline. Stop after page 10.
        // TODO: make the maximum page depth configurable.
        String url = page.getUrl();
        int pageNo = Integer.parseInt(url.substring(url.indexOf("page=") + 5));
        if (pageNo > 10) {
            return;
        }
        sdi.addAll(weiBoCrawler(page));
        enqueueUserLinks(page, next);
    }
}

/**
 * Adds every weibo.cn user link found on the page to the crawl frontier.
 * Relative links are made absolute; sinaurl redirect links are skipped.
 */
private void enqueueUserLinks(Page page, CrawlDatums next) {
    for (Element e : page.select("a")) {
        String href = e.attr("href");
        if (!href.matches("(http://weibo.cn)?(/u?/?[^/]*)")) {
            continue;
        }
        if (href.indexOf("http://weibo.cn") < 0) {
            href = "http://weibo.cn" + href; // relative link -> absolute
        }
        if (href.indexOf("sinaurl") < 0) {
            System.out.println("过了的:" + href);
            next.add(new CrawlDatum(href));
        }
    }
}
爬取微博方法:
/**
 * Scrapes every weibo post from one timeline page of weibo.cn and persists
 * each post to MySQL (when a jdbcTemplate is configured).
 *
 * @param page the fetched timeline page (http://weibo.cn/...page=N)
 * @return all posts parsed from this page
 */
private List<SinaDataInfo> weiBoCrawler(Page page) {
    List<SinaDataInfo> sdv = new Vector<SinaDataInfo>();
    String url = page.getUrl();
    String wbuid = "";
    // User id: text between the last '/' and the query string (if any).
    // NOTE(review): the +2 offset skips one extra character after the slash —
    // presumably to drop the 'u' in /u... style paths; confirm against the
    // actual URL shapes this crawler sees.
    if (url.indexOf("?") > 0) {
        wbuid = url.substring(url.lastIndexOf("/") + 2, url.indexOf("?"));
    } else {
        wbuid = url.substring(url.lastIndexOf("/") + 2);
    }
    // Nickname: first space-delimited token of the profile header (div.u span.ctt).
    String userSimpla = page.select("div.u").select("span.ctt").get(0).text();
    String wbuser = userSimpla.substring(0, userSimpla.indexOf(" "));
    Elements wbBox = page.select("div.c");
    // The last two div.c elements are footer/pagination, not posts — drop them.
    for (int i = 0; i < 2; i++)
        wbBox.remove(wbBox.size() - 1);
    for (Element e : wbBox) {
        SinaDataInfo sd = new SinaDataInfo();
        // Post id comes from the element's id attribute.
        String weiboid = e.attr("id");
        // Post text.
        String content = e.select("span.ctt").text();
        // Client/platform: everything after the date in the span.ct footer.
        String platform = e.select("span.ct").text().substring(e.select("span.ct").text().indexOf(" ", 2) + 1);
        // Location fields are not extracted from this page; stored empty.
        String address = "";
        String lon = "";
        String lat = "";
        String origincontent = "";
        // 赞/转发/评论 are always the trailing <a> tags in the last inner div.
        // NOTE(review): parsing assumes each counter text contains "[n]" —
        // a post without brackets would throw; confirm against real pages.
        Elements rp = e.select("div").last().select("a");
        // 赞 (likes): 4th <a> from the end, text like "赞[12]".
        String praiseStr = rp.get(rp.size() - 4).text();
        Integer praise = Integer.parseInt(praiseStr.substring(praiseStr.indexOf("[") + 1, praiseStr.indexOf("]")));
        // 转发 (reposts): 3rd <a> from the end.
        String repostStr = rp.get(rp.size() - 3).text();
        Integer repost = Integer.parseInt(repostStr.substring(repostStr.indexOf("[") + 1, repostStr.indexOf("]")));
        // 评论 (comments): 2nd <a> from the end.
        // BUGFIX: previously read rp.size()-3 again, so wbcomment always
        // duplicated the repost count.
        String wbcommentStr = rp.get(rp.size() - 2).text();
        Integer wbcomment = Integer.parseInt(wbcommentStr.substring(wbcommentStr.indexOf("[") + 1, wbcommentStr.indexOf("]")));
        // Publish date: the leading part of the span.ct footer, before the platform.
        String published = e.select("span.ct").text().substring(0, e.select("span.ct").text().indexOf(" ", 2));
        // Posts containing class 'cmt' are reposts and carry the original
        // post's repost/comment counters.
        Integer originrepost = 0;
        Integer origincomment = 0;
        if (e.select("span").hasClass("cmt")) {
            // Original post's repost count: third span.cmt.
            String originrepostStr = e.select("span.cmt").get(2).text();
            originrepost = Integer.parseInt(originrepostStr.substring(originrepostStr.indexOf("[") + 1, originrepostStr.indexOf("]")));
            // Original post's comment count: first a.cc link.
            String origincommentStr = e.select("a.cc").get(0).text();
            origincomment = Integer.parseInt(origincommentStr.substring(origincommentStr.indexOf("[") + 1, origincommentStr.indexOf("]")));
        }
        sd.setAddress(address)
                .setContent(content)
                .setLat(lat)
                .setLon(lon)
                .setPlatform(platform)
                .setPraise(praise)
                .setPublished(published)
                .setRepost(repost)
                .setWbcomment(wbcomment)
                .setWeiboid(weiboid)
                .setOrigincomment(origincomment)
                .setOriginrepost(originrepost)
                .setWbuid(wbuid)
                .setWbuser(wbuser);
        sdv.add(sd);
        if (jdbcTemplate != null) {
            // Parameterized insert — values are bound, not concatenated.
            int updates = jdbcTemplate.update("insert into weiboinfo"
                    + " (wbuser,wbuid,weiboid,content,html,platform,address,lon,lat,origincontent"
                    + ",praise,repost,wbcomment,originrepost,origincomment,published) value(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
                    wbuser, wbuid, weiboid, content, null, platform, address, lon, lat, origincontent,
                    praise, repost, wbcomment, originrepost, origincomment, published);
            if (updates == 1) {
                System.out.println("mysql插入成功");
            }
        }
    }
    return sdv;
}
看着很麻烦吧,我也觉得,但这已经比自己解析页面简单得太多了。
因为好用,所以分享。码字不易,转发请注明出处:
http://blog.csdn.net/qq_28945021/article/details/52300736