采集百度贴吧信息(原创)

package bean;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class GatherMessage {

    /**
     * Downloads a static page and returns its HTML source decoded with the
     * given charset.
     *
     * @param strUrl  absolute URL of the page to fetch
     * @param urlCode charset name of the page (e.g. "gb2312", "UTF-8")
     * @return the page source with "\r\n" line endings, or the legacy marker
     *         string {@code "error open url:" + strUrl} on any failure
     *         (callers such as {@link #getUsefullContent} rely on a non-null
     *         return instead of an exception)
     */
    public static String getHtmlContent(String strUrl, String urlCode) {
        try {
            URL url = new URL(strUrl);
            // Decode the byte stream with the page's charset directly.  The old
            // code read with the platform default charset and then tried to
            // repair the text via new String(s.getBytes(), urlCode), which
            // corrupts any non-ASCII content instead of fixing it.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(url.openStream(), urlCode))) {
                StringBuilder page = new StringBuilder();
                String line;
                while ((line = reader.readLine()) != null) {
                    page.append(line).append("\r\n");
                }
                return page.toString();
            }
        } catch (Exception e) {
            // Legacy error contract: return a marker string, never throw.
            return "error open url:" + strUrl;
        }
    }

    /**
     * Returns the text between the first occurrence of {@code startStr} and
     * the next occurrence of {@code endStr}, minus the first character after
     * {@code startStr}.
     *
     * @param content  text to search
     * @param startStr opening delimiter
     * @param endStr   closing delimiter
     * @return the substring between the delimiters, skipping one character
     */
    public static String getUrlContent(String content, String startStr, String endStr) {
        int startULength = startStr.length();
        int startU = content.indexOf(startStr);
        int endU = content.indexOf(endStr, startU + 1);
        // NOTE(review): the extra "+1" drops the first character after startStr.
        // It looks like an off-by-one, but is kept because existing callers may
        // depend on it — confirm the intent before changing.
        return content.substring(startU + startULength + 1, endU);
    }

    /**
     * Collects hyperlink targets from {@code content} and prefixes each with
     * {@code siteUrl} to form absolute URLs.
     *
     * <p>NOTE(review): {@code startHref} and {@code endHref} are interpolated
     * into a regular expression unescaped, so this only works for
     * regex-neutral delimiters such as {@code "href="} and {@code ">"}.
     *
     * @param content   HTML fragment to scan
     * @param siteUrl   site prefix prepended to every extracted link
     * @param startHref text that precedes each link target (e.g. "href=")
     * @param endHref   character that terminates each link target (e.g. ">")
     * @return iterator over the absolute URLs, in document order
     */
    public static Iterator<String> getHrefIterator(String content, String siteUrl,
            String startHref, String endHref) {
        int startHLength = startHref.length();
        // Match startHref followed by at least one character up to endHref.
        String regex = startHref + ".[^" + endHref + "]*";
        Pattern pattern = Pattern.compile(regex);
        Matcher matcher = pattern.matcher(content);
        List<String> hrefList = new ArrayList<String>();
        while (matcher.find()) {
            String msg = matcher.group();
            hrefList.add(siteUrl + msg.substring(startHLength));
        }
        return hrefList.iterator();
    }

    /**
     * Expands a templated URL of the form {@code "...(first-last)..."} into the
     * concrete page URLs, stepping by {@code urlRule}.
     *
     * <p>Example: {@code getUrlIterator("http://x?pn=(0-60)", "30")} yields
     * {@code http://x?pn=0}, {@code http://x?pn=30}, {@code http://x?pn=60}.
     *
     * @param startUrl URL containing a "(first-last)" page-number range
     * @param urlRule  increment between consecutive page numbers, as a string
     * @return iterator over the generated URLs, ascending
     */
    public static Iterator<String> getUrlIterator(String startUrl, String urlRule) {
        int step = Integer.parseInt(urlRule);
        int start = startUrl.indexOf("(");
        int end = startUrl.indexOf(")");
        int middle = startUrl.indexOf("-");
        int firstNum = Integer.parseInt(startUrl.substring(start + 1, middle));
        int endNum = Integer.parseInt(startUrl.substring(middle + 1, end));
        String leftStr = startUrl.substring(0, start);
        String rightStr = startUrl.substring(end + 1);
        List<String> urlList = new ArrayList<String>();
        for (int i = firstNum; i <= endNum; i += step) {
            String urlStr = leftStr + i + rightStr;
            System.out.println(urlStr);  // kept: original progress trace
            urlList.add(urlStr);
        }
        return urlList.iterator();
    }

    /**
     * Fetches the page at {@code urlStr} and concatenates every section found
     * between {@code startStr} and {@code endStr} (delimiters excluded).
     *
     * <p>The original implementation split on {@code startStr} as a regex and
     * terminated a {@code for (i < 99999)} loop by letting an
     * ArrayIndexOutOfBoundsException fall into the catch block.  This version
     * scans with {@code indexOf} (delimiters are treated as literals), which
     * produces the same output for literal delimiters such as
     * {@code "<cc>"}/{@code "</cc>"}.
     *
     * @param urlStr   URL of the page to collect from
     * @param startStr literal opening delimiter
     * @param endStr   literal closing delimiter
     * @param urlCode  charset of the page
     * @return the concatenated sections, or the legacy Chinese error message
     *         when the page contains no {@code startStr} (which includes the
     *         "error open url:..." result of a failed fetch)
     */
    public static String getUsefullContent(String urlStr, String startStr,
            String endStr, String urlCode) {
        try {
            String content = getHtmlContent(urlStr, urlCode);
            if (content.indexOf(startStr) < 0) {
                // The old code blew out of its split loop here and landed in
                // the catch block; keep the same user-facing error text.
                return "对不起,您打开链接失败,请检查网络或者您输入的url地址无效!";
            }
            return extractBetween(content, startStr, endStr);
        } catch (Exception e) {
            return "对不起,您打开链接失败,请检查网络或者您输入的url地址无效!";
        }
    }

    /**
     * Concatenates every substring of {@code content} that lies between a
     * {@code startStr}/{@code endStr} pair, delimiters excluded, in order.
     *
     * @param content  text to scan
     * @param startStr literal opening delimiter
     * @param endStr   literal closing delimiter
     * @return the joined sections; empty string when no pair is found
     */
    public static String extractBetween(String content, String startStr, String endStr) {
        StringBuilder result = new StringBuilder();
        int from = 0;
        while (true) {
            int begin = content.indexOf(startStr, from);
            if (begin < 0) {
                break;
            }
            begin += startStr.length();
            int end = content.indexOf(endStr, begin);
            if (end < 0) {
                break;  // unterminated section: stop, keep what we have
            }
            result.append(content, begin, end);
            from = end + endStr.length();
        }
        return result.toString();
    }

    /**
     * Writes {@code content} to {@code fileName} as UTF-8, replacing any
     * existing file.
     *
     * <p>The charset is now explicit; the old code used the platform default,
     * producing different bytes on different machines.
     *
     * @param content  text to write
     * @param fileName destination path
     * @throws IOException if the file cannot be written
     */
    public static void saveToText(String content, String fileName) throws IOException {
        try (FileOutputStream out = new FileOutputStream(fileName)) {
            out.write(content.getBytes(StandardCharsets.UTF_8));
        }
    }

    /**
     * Reads the whole text file and returns its content decoded as UTF-8.
     *
     * <p>The original implementation passed a {@code null} buffer to
     * {@code InputStream.read}, which always throws NullPointerException, then
     * returned {@code null}; no caller could ever have observed a result, so
     * the broken side-copy to "test.txt" has been removed as well.
     *
     * @param fileName path of the file to read
     * @return the file's content as a string
     * @throws IOException if the file cannot be read
     */
    public static String readText(String fileName) throws IOException {
        byte[] bytes = Files.readAllBytes(new File(fileName).toPath());
        return new String(bytes, StandardCharsets.UTF_8);
    }

    /**
     * Crawls a range of Baidu Tieba pages and saves the content found between
     * {@code <cc>} and {@code </cc>} on each page to file0.html, file1.html, …
     */
    public static void main(String[] args) throws IOException {
        // Charset of the crawled pages.
        String urlCode = "gb2312";

        // Paging template: "(0-60)" expands to pn=0, pn=30, pn=60.
        String startUrl = "http://tieba.baidu.com/f?z=641553145&ct=335544320&lm=0&sc=0&rn=30&tn=baiduPostBrowser&word=%B9%ED%B4%B5%B5%C6&pn=(0-60)";
        String urlRule = "30";

        // Delimiters around the useful content on each page.
        String startInf = "<cc>";
        String endInf = "</cc>";

        Iterator<String> urlIterator = getUrlIterator(startUrl, urlRule);

        int i = 0;
        while (urlIterator.hasNext()) {
            String urlString = urlIterator.next();
            String gatheredContent = getUsefullContent(urlString, startInf, endInf, urlCode);
            // Save what was collected from each URL to its own file.
            saveToText(gatheredContent, "file" + i + ".html");
            i++;
        }

        System.out.println("Complete!");
    }
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值