B站小爬虫

最新推荐文章于 2024-02-25 18:43:15 发布
dessy93
最新推荐文章于 2024-02-25 18:43:15 发布
阅读量987
点赞数
本文链接：https://blog.csdn.net/dessy93/article/details/49644607
版权
由于刚接触 Java 没多久，代码写得比较丑，请见谅。
/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */

/**
 *
 * @author ivan93
 */
package wybspider;

import java.awt.Container;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import javax.swing.JButton;
import javax.swing.JFrame;
import javax.swing.JLabel;
import javax.swing.JOptionPane;
import javax.swing.JScrollPane;
import javax.swing.JTextArea;
import javax.swing.JTextField;
import javax.swing.SwingUtilities;
import javax.swing.WindowConstants;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;




/**
 * Swing front end and scraper for bilibili category listing pages.
 *
 * <p>The window asks for a listing URL (for example
 * {@code http://www.bilibili.com/video/douga-mad-1.html}), an output file name
 * and a page count. Pressing the button downloads pages 1..N of that listing,
 * extracts every video entry with a regular expression and writes the entries
 * as rows of an HTML table to {@code <name>.html} in the working directory.
 */
public class BSpider extends JFrame {

    private static final Logger LOGGER = Logger.getLogger(BSpider.class.getName());

    /** Charset used to decode downloaded pages. */
    private static final String PAGE_CHARSET = "utf-8";

    /** Page number the crawl always starts from. */
    private static final int FIRST_PAGE = 1;

    /**
     * Matches one video entry of a listing page. Groups: 1 = av id, 2 = title,
     * 3 = view count, 4 = favourite count, 5 = danmaku count, 6 = uploader.
     * Compiled once instead of once per downloaded page.
     */
    private static final Pattern VIDEO_ITEM = Pattern.compile(
            "<div class=\"l-item\"><a href=\"/video/av(.+?)/\""
            + ".+?class=\"title\" title=\"(.+?)\""
            + ".+?title=\"观看\" number=\"(.+?)\""
            + ".+?title=\"收藏\" number=\"(.+?)\""
            + ".+?title=\"弹幕\" number=\"(.+?)\""
            + ".+?<i class=\"up r10000\">(.+?)</i>");

    /** Trailing {@code <page number>.html} of a listing URL. */
    private static final Pattern PAGE_SUFFIX = Pattern.compile("\\d+\\.html$");

    /** Everything of the report that precedes the data rows. */
    private static final String REPORT_HEADER =
            "<html xmlns=\"http://www.w3.org/1999/xhtml\">"
            + "<head>"
            + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
            + "<title>无标题文档</title>"
            + "</head>"
            + "<body>"
            + "<table border=\"2\" align=\"center\"  >"
            + "<tr>"
            + "<td width=\"700\">标题</td>"
            + "<td width=\"100\">观看</td>"
            + "<td width=\"100\">收藏</td>"
            + "<td width=\"100\">弹幕</td>"
            + "<td width=\"200\">up主</td>"
            + "<td width=\"100\">链接</td>"
            + "</tr>";

    /** Everything of the report that follows the data rows. */
    private static final String REPORT_FOOTER = "</table></body></html>";

    // Not read anywhere in this class; kept because it is package-visible.
    static int flag = 1;

    /** Shared HTTP client used by {@link #downloadPage(String)}. */
    static HttpClient httpCLient = new DefaultHttpClient();

    /** Path of the report file that {@link #gettext(String)} appends to. */
    static String file = "";

    private JTextField turl = new JTextField();
    private JTextField tfile = new JTextField();
    private JTextField tpage = new JTextField();
    // The text area and its scroll pane are created but not added to the layout.
    private JTextArea jt = new JTextArea();
    private JScrollPane sp = new JScrollPane(jt);
    private JButton crawing = new JButton("开始爬");
    private JLabel jurl = new JLabel("网址:");
    private JLabel jfile = new JLabel("储存文件名:");
    private JLabel jpage = new JLabel("爬的页数");

    /**
     * Builds the window with absolutely positioned widgets, wires the crawl
     * button and shows the frame.
     */
    public BSpider() {
        setTitle("B站小爬虫");
        setLayout(null);
        setBounds(0, 0, 700, 500);
        setDefaultCloseOperation(WindowConstants.EXIT_ON_CLOSE);

        Container c = getContentPane();
        addRow(c, jurl, turl, 50, 400);
        addRow(c, jfile, tfile, 180, 200);
        addRow(c, jpage, tpage, 310, 200);

        crawing.setBounds(100, 400, 100, 50);
        crawing.addActionListener(e -> startCrawl());
        c.add(crawing);

        setVisible(true);
    }

    /** Places a label and its text field on one row whose top edge is {@code y}. */
    private static void addRow(Container c, JLabel label, JTextField field, int y, int fieldWidth) {
        label.setBounds(50, y, 50, 50);
        field.setBounds(100, y + 10, fieldWidth, 30);
        c.add(label);
        c.add(field);
    }

    /**
     * Validates the three inputs and runs the crawl on a background thread so
     * the event-dispatch thread stays responsive. The button is disabled while
     * a crawl is running, so only one crawl uses {@link #file} at a time.
     */
    private void startCrawl() {
        String url = turl.getText().trim();
        String name = tfile.getText().trim();
        int lastPage = parsePageCount(tpage.getText());
        if (url.isEmpty() || name.isEmpty() || lastPage < FIRST_PAGE) {
            JOptionPane.showMessageDialog(this,
                    "请填写网址、储存文件名，以及不小于1的页数",
                    "B站小爬虫",
                    JOptionPane.ERROR_MESSAGE);
            return;
        }

        file = name + ".html";
        crawing.setEnabled(false);
        new Thread(() -> {
            try {
                writeReport(url, lastPage);
            } finally {
                // Swing components may only be touched on the event-dispatch thread.
                SwingUtilities.invokeLater(() -> crawing.setEnabled(true));
            }
        }, "bspider-crawl").start();
    }

    /** Parses the page-count field; returns -1 when it is not a valid integer. */
    private static int parsePageCount(String text) {
        try {
            return Integer.parseInt(text.trim());
        } catch (NumberFormatException ex) {
            return -1;
        }
    }

    /**
     * Writes the complete report to {@link #file}: header, one batch of rows
     * per crawled page, footer. The header is written in overwrite mode, so a
     * report left over from an earlier run is replaced. The footer is written
     * even if the crawl stops early, so the document is always closed.
     */
    private static void writeReport(String url, int lastPage) {
        try (PrintWriter out = openReport(false)) {
            out.println(REPORT_HEADER);
        } catch (FileNotFoundException ex) {
            LOGGER.log(Level.SEVERE, "Cannot create report file " + file, ex);
            return;
        }

        try {
            changepage(url, FIRST_PAGE, lastPage);
        } catch (Exception ex) {
            LOGGER.log(Level.SEVERE, "Crawl of " + url + " failed", ex);
        }

        try (PrintWriter out = openReport(true)) {
            out.println(REPORT_FOOTER);
        } catch (FileNotFoundException ex) {
            LOGGER.log(Level.SEVERE, "Cannot finish report file " + file, ex);
        }
    }

    /**
     * Opens {@link #file} for writing as UTF-8, matching the charset declared
     * in the report's meta tag.
     *
     * @param append {@code true} to append, {@code false} to truncate first
     * @throws FileNotFoundException if the file cannot be opened for writing
     */
    private static PrintWriter openReport(boolean append) throws FileNotFoundException {
        return new PrintWriter(
                new OutputStreamWriter(new FileOutputStream(file, append), StandardCharsets.UTF_8));
    }

    /**
     * Gunzips a stream and decodes the result as text. The stream is closed
     * once it has been wrapped and read.
     *
     * @param in gzip-compressed input
     * @param charset name of the charset used to decode the decompressed bytes
     * @return the decoded text, or {@code null} if the stream is not valid gzip
     *         data, cannot be read, or the charset is unsupported
     */
    public static String uncompressToString(InputStream in, String charset) {
        try (GZIPInputStream gunzip = new GZIPInputStream(in)) {
            return drain(gunzip, charset);
        } catch (IOException e) {
            LOGGER.log(Level.WARNING, "Failed to decompress gzip stream", e);
            return null;
        }
    }

    /** Reads a stream to its end and decodes the bytes with the given charset. */
    private static String drain(InputStream in, String charset) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        byte[] buffer = new byte[8192];
        int n;
        while ((n = in.read(buffer)) >= 0) {
            out.write(buffer, 0, n);
        }
        return out.toString(charset);
    }

    /**
     * Reports whether the stream starts with the gzip magic bytes
     * {@code 0x1f 0x8b}. The stream position is restored afterwards, so the
     * stream must support mark/reset.
     */
    private static boolean startsWithGzipMagic(InputStream in) throws IOException {
        in.mark(2);
        int first = in.read();
        int second = in.read();
        in.reset();
        return first == 0x1f && second == 0x8b;
    }

    /**
     * Extracts every video entry from a listing page, echoes it to standard
     * output and appends it as a table row to {@link #file}.
     *
     * @param str HTML of one listing page; {@code null} or empty input is ignored
     * @throws FileNotFoundException if there are rows to write and the report
     *         file cannot be opened
     */
    static void gettext(String str) throws FileNotFoundException {
        if (str == null || str.isEmpty()) {
            return;
        }

        StringBuilder rows = new StringBuilder();
        Matcher matcher = VIDEO_ITEM.matcher(str);
        while (matcher.find()) {
            String link = "http://www.bilibili.com/video/av" + matcher.group(1);
            String title = matcher.group(2);
            String gk = matcher.group(3);
            String sc = matcher.group(4);
            String dm = matcher.group(5);
            String up = matcher.group(6);
            System.out.println("链接:" + link + "\t标题:" + title + "\t观看:" + gk
                    + "\t收藏:" + sc + "\t弹幕:" + dm + "\tup主:" + up);

            // Captured values are copied from the source markup verbatim,
            // so they are written back without any further escaping.
            rows.append("<tr>")
                .append("<td>").append(title).append("</td>")
                .append("<td>").append(gk).append("</td>")
                .append("<td>").append(sc).append("</td>")
                .append("<td>").append(dm).append("</td>")
                .append("<td>").append(up).append("</td>")
                .append("<td><a href=\"").append(link).append("\">链接</a></td>")
                .append("</tr>");
        }
        if (rows.length() == 0) {
            return;
        }

        try (PrintWriter out = openReport(true)) {
            out.println(rows.toString());
            // PrintWriter swallows IOExceptions, so ask it explicitly.
            if (out.checkError()) {
                LOGGER.log(Level.WARNING, "Failed to write rows to {0}", file);
            }
        }
    }

    /**
     * Downloads one page and returns its HTML decoded as UTF-8. Bodies that
     * start with the gzip magic bytes are decompressed; any other body is
     * decoded as it is.
     *
     * @param path absolute URL of the page
     * @return the page text, or an empty string when the response has no body
     *         or the download fails (the failure is logged)
     * @throws Exception kept in the signature for existing callers; I/O
     *         failures are logged rather than thrown
     */
    public static String downloadPage(String path) throws Exception {
        HttpGet httpget = new HttpGet(path);
        try {
            HttpResponse response = httpCLient.execute(httpget);
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                return "";
            }
            // Closing the content stream releases the underlying connection.
            try (InputStream body = new BufferedInputStream(entity.getContent())) {
                String content = startsWithGzipMagic(body)
                        ? uncompressToString(body, PAGE_CHARSET)
                        : drain(body, PAGE_CHARSET);
                return content == null ? "" : content;
            }
        } catch (IOException e) {
            LOGGER.log(Level.WARNING, "Failed to download " + path, e);
            return "";
        }
    }

    /**
     * Downloads and processes pages {@code page..topage} (inclusive) of a
     * listing. The page number at the end of {@code url} is replaced by each
     * page number in turn.
     *
     * @param url listing URL ending in {@code <number>.html}
     * @param page first page to fetch
     * @param topage last page to fetch
     * @throws IllegalArgumentException if {@code url} does not end in
     *         {@code <number>.html}
     * @throws Exception if a page cannot be processed
     */
    public static void changepage(String url, int page, int topage) throws Exception {
        String prefix = pagePrefix(url);
        for (int i = page; i <= topage; i++) {
            gettext(downloadPage(prefix + i + ".html"));
        }
    }

    /** Strips the trailing {@code <number>.html} from a listing URL. */
    private static String pagePrefix(String url) {
        if (url == null) {
            throw new IllegalArgumentException("url must not be null");
        }
        Matcher suffix = PAGE_SUFFIX.matcher(url);
        if (!suffix.find()) {
            throw new IllegalArgumentException(
                    "Listing URL must end with <page number>.html: " + url);
        }
        return url.substring(0, suffix.start());
    }

    /**
     * Launches the application window.
     *
     * @param args ignored
     */
    public static void main(String[] args) throws Exception {
        // Swing components must be created on the event-dispatch thread.
        SwingUtilities.invokeLater(() -> new BSpider());
    }

}
程序界面有三个输入框：
第一个：输入要抓取的 B 站网址。注意必须是 B 站对应分类的列表页链接，如“动画-MAD-AMV”这个分类的链接为：http://www.bilibili.com/video/douga-mad-1.html
音乐-翻唱链接为：http://www.bilibili.com/video/music-Cover-1.html
第二个：本地保存的文件名，不需要输入文件后缀（程序会自动加上 .html），文件保存在运行 Java 程序时的当前文件夹。
第三个：要抓取的页数，填数字即可。