由于刚接触 Java 没多久,代码写得很丑。
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
/**
*
* @author ivan93
*/
package wybspider;
import java.awt.Container;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import javax.swing.JButton;
import javax.swing.JFrame;
import javax.swing.JLabel;
import javax.swing.JScrollPane;
import javax.swing.JTextArea;
import javax.swing.JTextField;
import javax.swing.WindowConstants;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
/**
 * Small Swing front end around a crawler for bilibili category listing pages.
 *
 * <p>The window collects a listing URL, an output file name and a page count.
 * Pressing the button downloads each listing page, extracts the video entries
 * with a regular expression and writes them as rows of an HTML table into
 * {@code <name>.html} (relative to the working directory).
 */
public class BSpider extends JFrame {

    private static final Logger LOGGER = Logger.getLogger(BSpider.class.getName());

    /** Charset used to decode downloaded pages and to encode the report file. */
    private static final String UTF8 = "utf-8";

    /**
     * One video entry on a listing page. Groups: 1 = av id, 2 = title,
     * 3 = view count, 4 = favourite count, 5 = danmaku count, 6 = uploader.
     * Compiled once because the expression never changes.
     */
    private static final Pattern VIDEO_ITEM = Pattern
            .compile("<div class=\"l-item\"><a href=\"/video/av(.+?)/\".+?class=\"title\" title=\"(.+?)\".+?title=\"观看\" number=\"(.+?)\".+?title=\"收藏\" number=\"(.+?)\".+?title=\"弹幕\" number=\"(.+?)\".+?<i class=\"up r10000\">(.+?)</i>");

    /** Trailing {@code <page number>.html} part of a listing URL. */
    private static final Pattern PAGE_SUFFIX = Pattern.compile("\\d+\\.html$");

    /** Opening markup of the report, up to and including the table's heading row. */
    private static final String REPORT_HEADER =
            "<html xmlns=\"http://www.w3.org/1999/xhtml\">"
            + "<head>"
            + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
            + "<title>无标题文档</title>"
            + "</head>"
            + "<body>"
            + "<table border=\"2\" align=\"center\" >"
            + "<tr>"
            + "<td width=\"700\">标题</td>"
            + "<td width=\"100\">观看</td>"
            + "<td width=\"100\">收藏</td>"
            + "<td width=\"100\">弹幕</td>"
            + "<td width=\"200\">up主</td>"
            + "<td width=\"100\">链接</td>"
            + "</tr>";

    /** Closing markup of the report. */
    private static final String REPORT_FOOTER = "</table>" + "</body>" + "</html>";

    static int flag = 1;
    static HttpClient httpCLient = new DefaultHttpClient();
    /** Path of the report file that {@link #gettext(String)} appends to. */
    static String file = "";

    private JTextField turl = new JTextField();
    private JTextField tfile = new JTextField();
    private JTextField tpage = new JTextField();
    // jt/sp are created but never added to the frame.
    private JTextArea jt = new JTextArea();
    private JScrollPane sp = new JScrollPane(jt);
    private JButton crawing = new JButton("开始爬");
    private JLabel jurl = new JLabel("网址:");
    private JLabel jfile = new JLabel("储存文件名:");
    private JLabel jpage = new JLabel("爬的页数");

    /** Builds the window with absolutely positioned inputs and shows it. */
    public BSpider() {
        setTitle("B站小爬虫");
        setLayout(null);
        setBounds(0, 0, 700, 500);
        Container c = getContentPane();
        jurl.setBounds(50, 50, 50, 50);
        turl.setBounds(100, 60, 400, 30);
        c.add(jurl);
        c.add(turl);
        jfile.setBounds(50, 180, 50, 50);
        tfile.setBounds(100, 190, 200, 30);
        c.add(jfile);
        c.add(tfile);
        jpage.setBounds(50, 310, 50, 50);
        tpage.setBounds(100, 320, 200, 30);
        c.add(jpage);
        c.add(tpage);
        crawing.setBounds(100, 400, 100, 50);
        c.add(crawing);
        setVisible(true);
        setDefaultCloseOperation(WindowConstants.EXIT_ON_CLOSE);
    }

    /**
     * Reads a gzip-compressed stream to its end and decodes it as text.
     * The stream is closed before returning.
     *
     * @param in      gzip-compressed input; {@code null} yields {@code null}
     * @param charset name of the charset used to decode the decompressed bytes
     * @return the decoded text, or {@code null} if the data could not be read,
     *         is not valid gzip, or the charset name is unsupported
     */
    public static String uncompressToString(InputStream in, String charset) {
        if (in == null) {
            return null;
        }
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        // 'raw' is listed separately so it is closed even if the gzip header is invalid.
        try (InputStream raw = in; GZIPInputStream gunzip = new GZIPInputStream(raw)) {
            byte[] buffer = new byte[8192];
            int n;
            while ((n = gunzip.read(buffer)) >= 0) {
                out.write(buffer, 0, n);
            }
            return out.toString(charset);
        } catch (IOException e) {
            LOGGER.log(Level.WARNING, "Could not decompress response body", e);
            return null;
        }
    }

    /**
     * Extracts every video entry from a listing page, echoes it to standard
     * output and appends it as a table row to the report file named by
     * {@link #file}.
     *
     * @param str HTML of one listing page; {@code null} or empty is skipped
     * @throws FileNotFoundException if the report file cannot be opened
     */
    static void gettext(String str) throws FileNotFoundException {
        if (str == null || str.isEmpty()) {
            // A failed download must not abort the remaining pages.
            LOGGER.warning("No page content to parse; skipping");
            return;
        }
        StringBuilder rows = new StringBuilder();
        Matcher matcher = VIDEO_ITEM.matcher(str);
        while (matcher.find()) {
            String link = "http://www.bilibili.com/video/av" + matcher.group(1);
            String title = matcher.group(2);
            String gk = matcher.group(3);
            String sc = matcher.group(4);
            String dm = matcher.group(5);
            String up = matcher.group(6);
            System.out.println("链接:" + link + "\t标题:" + title + "\t观看:" + gk
                    + "\t收藏:" + sc + "\t弹幕:" + dm + "\tup主:" + up);
            rows.append("<tr>");
            rows.append("<td>").append(title).append("</td>");
            rows.append("<td>").append(gk).append("</td>");
            rows.append("<td>").append(sc).append("</td>");
            rows.append("<td>").append(dm).append("</td>");
            rows.append("<td>").append(up).append("</td>");
            // The attribute value is quoted: the report declares itself as XHTML.
            rows.append("<td>").append("<a href=\"").append(link).append("\">")
                    .append("链接").append("</a>").append("</td>");
            rows.append("</tr>");
        }
        writeReportLine(rows.toString(), true);
    }

    /**
     * Downloads one page and returns its text. The response body is assumed
     * to be gzip-compressed and UTF-8 encoded.
     *
     * @param path URL of the page
     * @return the page text, or an empty string if the request or the
     *         decompression failed (the failure is logged)
     * @throws Exception if {@code path} cannot be turned into a request
     */
    public static String downloadPage(String path) throws Exception {
        HttpGet httpget = new HttpGet(path);
        String content = "";
        try {
            HttpResponse response = httpCLient.execute(httpget);
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                String body = uncompressToString(entity.getContent(), UTF8);
                if (body != null) {
                    content = body;
                }
            }
        } catch (ClientProtocolException e) {
            LOGGER.log(Level.WARNING, "HTTP protocol error while fetching " + path, e);
        } catch (IOException e) {
            LOGGER.log(Level.WARNING, "I/O error while fetching " + path, e);
        } finally {
            // Always give the connection back, also on failure, so the shared
            // client can serve the next request.
            httpget.abort();
        }
        return content;
    }

    /**
     * Crawls the listing pages {@code page..topage} (inclusive). The page
     * number at the end of {@code url} is replaced by each page index in turn.
     *
     * @param url    a listing URL ending in {@code <page number>.html}
     * @param page   first page to fetch
     * @param topage last page to fetch
     * @throws IllegalArgumentException if {@code url} is {@code null} or does
     *         not end in {@code <page number>.html}
     * @throws Exception if a page cannot be requested or the report cannot be written
     */
    public static void changepage(String url, int page, int topage) throws Exception {
        if (url == null) {
            throw new IllegalArgumentException("url must not be null");
        }
        Matcher suffix = PAGE_SUFFIX.matcher(url);
        if (!suffix.find()) {
            throw new IllegalArgumentException(
                    "URL must end with \"<page number>.html\": " + url);
        }
        String prefix = url.substring(0, suffix.start());
        for (int i = page; i <= topage; i++) {
            gettext(BSpider.downloadPage(prefix + i + ".html"));
        }
    }

    /** Opens the window and starts a crawl from page 1 whenever the button is pressed. */
    public static void main(String[] args) throws Exception {
        final BSpider spider = new BSpider();
        final int page = 1;
        spider.crawing.addActionListener(new ActionListener() {
            @Override
            public void actionPerformed(ActionEvent e) {
                spider.startCrawl(page);
            }
        });
    }

    /**
     * Reads the form, (re)creates the report file and crawls the requested pages.
     * Invalid input and I/O failures are logged and end the run early.
     *
     * NOTE(review): this is invoked directly from the button listener, so the
     * window does not repaint until the crawl has finished.
     */
    private void startCrawl(int firstPage) {
        String url = turl.getText().trim();
        int lastPage;
        try {
            lastPage = Integer.parseInt(tpage.getText().trim());
        } catch (NumberFormatException ex) {
            LOGGER.log(Level.WARNING, "Page count is not a number: " + tpage.getText(), ex);
            return;
        }
        file = tfile.getText() + ".html";

        try {
            // Truncating here replaces any report left over from an earlier run.
            writeReportLine(REPORT_HEADER, false);
        } catch (FileNotFoundException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
            return;
        }
        try {
            changepage(url, firstPage, lastPage);
        } catch (Exception ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        }
        try {
            // Written even after a failed crawl so the document stays well formed.
            writeReportLine(REPORT_FOOTER, true);
        } catch (FileNotFoundException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        }
    }

    /** Writes {@code text} plus a line break to the report file and closes it again. */
    private static void writeReportLine(String text, boolean append) throws FileNotFoundException {
        try (PrintStream out = openReport(append)) {
            out.println(text);
            // PrintStream swallows I/O errors; surface them at least in the log.
            if (out.checkError()) {
                LOGGER.warning("Could not write to " + file);
            }
        }
    }

    /** Opens the report file as a UTF-8 stream, matching the charset its meta tag declares. */
    private static PrintStream openReport(boolean append) throws FileNotFoundException {
        FileOutputStream target = new FileOutputStream(file, append);
        try {
            return new PrintStream(target, false, UTF8);
        } catch (UnsupportedEncodingException e) {
            try {
                target.close();
            } catch (IOException ignored) {
                // Nothing more can be done; the original failure is reported below.
            }
            throw new IllegalStateException("UTF-8 is not supported by this JVM", e);
        }
    }
}
第一个输入框:输入要抓取的 B 站网址。注意必须是 B 站对应分类的列表页链接,并且以 1.html 结尾(即该分类的第 1 页)。例如“动画-MAD-AMV”分类的链接为:http://www.bilibili.com/video/douga-mad-1.html
“音乐-翻唱”分类的链接为:http://www.bilibili.com/video/music-Cover-1.html
第二个输入框:本地保存的文件名,不需要输入文件后缀(程序会自动加上 .html),文件保存在运行 Java 程序时的当前文件夹。
第三个输入框:要抓取的页数(从第 1 页开始),填数字即可。