package bean;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Simple scraper for static HTML pages: expands a paging template into URLs,
 * downloads each page, concatenates the fragments between a start/end marker
 * pair, and saves the result to disk.
 *
 * <p>Comments translated to English from the original Chinese.
 */
public class GatherMessage {

    /**
     * Downloads the HTML source of a static page.
     *
     * @param strUrl  the page URL
     * @param urlCode the page's character encoding, e.g. "gb2312"
     * @return the page source, or the literal string {@code "error open url:" + strUrl}
     *         on any failure (legacy contract — callers check the return value, not exceptions)
     */
    public static String getHtmlContent(String strUrl, String urlCode) {
        try {
            URL url = new URL(strUrl);
            StringBuilder page = new StringBuilder();
            // FIX: decode the byte stream with the page's declared charset up front.
            // The original read with the platform default charset and then tried to
            // re-decode via new String(s.getBytes(), urlCode), which corrupts any
            // character the default charset cannot round-trip. Also: the reader
            // leaked on exception; try-with-resources closes it on every path.
            try (BufferedReader reader =
                    new BufferedReader(new InputStreamReader(url.openStream(), urlCode))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    page.append(line).append("\r\n");
                }
            }
            return page.toString();
        } catch (Exception e) {
            // Preserved legacy contract: signal failure through the return value.
            return "error open url:" + strUrl;
        }
    }

    /**
     * Returns the substring between the first occurrence of {@code startStr}
     * and the following {@code endStr}.
     *
     * <p>NOTE(review): the extra {@code +1} skips one character immediately
     * after {@code startStr} (presumably a quote character); preserved from
     * the original behaviour — confirm against callers before changing.
     *
     * @throws StringIndexOutOfBoundsException if either marker is missing
     *         (unchanged from the original)
     */
    public static String getUrlContent(String content, String startStr, String endStr) {
        int startULength = startStr.length();
        int startU = content.indexOf(startStr);
        int endU = content.indexOf(endStr, startU + 1);
        return content.substring(startU + startULength + 1, endU);
    }

    /**
     * Collects hyperlink targets: every match of {@code startHref...endHref}
     * has its {@code startHref} prefix replaced by {@code siteUrl}, turning
     * site-relative links absolute.
     *
     * <p>NOTE(review): {@code endHref} is placed inside a regex character
     * class, so each of its individual characters terminates a match; this
     * only behaves as intended for single-character terminators such as ">".
     * Preserved as-is to keep existing match behaviour.
     */
    public static Iterator<String> getHrefIterator(
            String content, String siteUrl, String startHref, String endHref) {
        int startHLength = startHref.length();
        String regex = startHref + ".[^" + endHref + "]*"; // startHref, then up to a terminator
        Matcher matcher = Pattern.compile(regex).matcher(content);
        List<String> hrefList = new ArrayList<String>();
        while (matcher.find()) {
            String msg = matcher.group();
            hrefList.add(siteUrl + msg.substring(startHLength));
        }
        return hrefList.iterator();
    }

    /**
     * Expands a paging template such as {@code ".../pn=(0-60)"} with step
     * {@code urlRule} ("30") into the concrete URLs .../pn=0, .../pn=30, .../pn=60.
     *
     * @param startUrl URL containing a "(first-last)" range placeholder
     * @param urlRule  page-number increment as a decimal string; must be positive
     * @throws IllegalArgumentException if the increment is not positive
     *         (FIX: the original looped forever on 0 or a negative step)
     */
    public static Iterator<String> getUrlIterator(String startUrl, String urlRule) {
        int step = Integer.parseInt(urlRule);
        if (step <= 0) {
            throw new IllegalArgumentException("urlRule must be a positive integer: " + urlRule);
        }
        int start = startUrl.indexOf("(");
        int end = startUrl.indexOf(")");
        // FIX: search for "-" after "(" — the original scanned from position 0
        // and broke on any URL containing a dash before the range placeholder.
        int middle = startUrl.indexOf("-", start + 1);
        int firstNum = Integer.parseInt(startUrl.substring(start + 1, middle));
        int endNum = Integer.parseInt(startUrl.substring(middle + 1, end));
        String leftStr = startUrl.substring(0, start);
        String rightStr = startUrl.substring(end + 1);
        List<String> urlList = new ArrayList<String>();
        for (int i = firstNum; i <= endNum; i += step) {
            String urlStr = leftStr + i + rightStr;
            System.out.println(urlStr); // progress output, kept from the original
            urlList.add(urlStr);
        }
        return urlList.iterator();
    }

    /**
     * Downloads a page and concatenates every fragment found between
     * {@code startStr} and {@code endStr} (markers excluded).
     *
     * <p>FIX: rewritten from a {@code split()}-based loop bounded by
     * {@code i < 99999} that threw ArrayIndexOutOfBoundsException when the
     * page contained no marker (silently masked by the catch-all). Now a
     * plain index scan: zero matches yield "", and an unterminated final
     * fragment is ignored instead of crashing.
     *
     * @return the concatenated fragments, or a Chinese error message when the
     *         scan itself fails (legacy contract, message preserved verbatim)
     */
    public static String getUsefullContent(
            String urlStr, String startStr, String endStr, String urlCode) {
        try {
            String content = getHtmlContent(urlStr, urlCode);
            StringBuilder usefull = new StringBuilder();
            int from = 0;
            while (true) {
                int begin = content.indexOf(startStr, from);
                if (begin < 0) {
                    break; // no more fragments
                }
                int stop = content.indexOf(endStr, begin + startStr.length());
                if (stop < 0) {
                    break; // unterminated fragment: ignore the tail
                }
                usefull.append(content, begin + startStr.length(), stop);
                from = stop + endStr.length();
            }
            return usefull.toString();
        } catch (Exception e) {
            return "对不起,您打开链接失败,请检查网络或者您输入的url地址无效!";
        }
    }

    /**
     * Writes the given content to a file using the platform default charset
     * (unchanged from the original — TODO confirm whether UTF-8/gb2312 is wanted).
     *
     * <p>FIX: try-with-resources so the stream is closed even when write() fails.
     */
    public static void saveToText(String Content, String fileName) throws IOException {
        try (FileOutputStream out = new FileOutputStream(fileName)) {
            out.write(Content.getBytes());
        }
    }

    /**
     * Reads a text file and returns its contents decoded with the platform
     * default charset.
     *
     * <p>FIX: the original passed a null byte[] to InputStream.read() (an
     * unconditional NullPointerException), copied the data to a hard-coded
     * "test.txt", and always returned null. It now actually reads the file
     * and returns its content.
     */
    public static String readText(String fileName) throws IOException {
        byte[] readContent = Files.readAllBytes(Paths.get(fileName));
        return new String(readContent);
    }

    /**
     * Entry point: expands the paging template, scrapes each page's
     * {@code <cc>...</cc>} blocks, and saves one output file per page.
     */
    public static void main(String[] args) throws IOException {
        String urlCode = "gb2312"; // source page encoding
        // Paging template: pn=(0-60) is expanded with the step below.
        String startUrl = "http://tieba.baidu.com/f?z=641553145&ct=335544320&lm=0&sc=0&rn=30&tn=baiduPostBrowser&word=%B9%ED%B4%B5%B5%C6&pn=(0-60)";
        String urlRule = "30";
        // Markers delimiting the useful post body on each page.
        String startInf = "<cc>";
        String endInf = "</cc>";
        Iterator<String> urlIterator = getUrlIterator(startUrl, urlRule);
        int i = 0;
        while (urlIterator.hasNext()) {
            String urlString = urlIterator.next();
            String gathered = getUsefullContent(urlString, startInf, endInf, urlCode);
            saveToText(gathered, "file" + i + ".html"); // one output file per scraped URL
            i++;
        }
        System.out.println("Complete!");
    }
}
// 采集百度贴吧信息 (original article title: "Collecting Baidu Tieba posts")
// 最新推荐文章于 2021-08-30 17:51:00 发布 (blog-platform metadata left over from the page this code was copied from)