目标:做一个简单的网站爬虫(怎么听怎么像virus。。。),访问父网站下的超链接,提取里面的文本内容。
开始时,手工写HTML的标签解析,部分代码如下:
/**
 * Returns the analyzed list of hyperlinks found under the parent URL.
 *
 * @return List<String> rewritten (de-duplicated, parent-filtered) href values
 * @throws IOException if the parent page cannot be fetched
 */
public List<String> getHrefList() throws IOException {
    // Bug fix: the helper in this version is named parserHref() and returns a
    // List<String>, not a Set<String> — the original called a non-existent
    // parseHref() and assigned it to a Set, which does not compile.
    List<String> rawHrefList = parserHref();
    hrefList = rewriteHref(rawHrefList);
    return hrefList;
}
/**
 * Fetches the parent page and collects every href value found in it, line by line.
 *
 * @return List<String> raw href values (may contain duplicates)
 * @throws IOException if the connection or read fails
 */
private List<String> parserHref() throws IOException {
    List<String> rawHrefList = new ArrayList<String>();
    URL url = new URL(parentUrl);
    HttpURLConnection connection = (HttpURLConnection) url.openConnection();
    // Bug fix: the original called setDoOutput(true), which silently turns the
    // request into a POST on HttpURLConnection; we only read, so keep the
    // default GET behavior.
    BufferedReader br = null;
    try {
        br = new BufferedReader(new InputStreamReader(
                connection.getInputStream(), getCharset(connection.getContentType())));
        String str;
        while ((str = br.readLine()) != null) {
            List<String> rs = getHref(str);
            if (rs != null) {
                rawHrefList.addAll(rs);
            }
        }
    } finally {
        // Bug fix: the reader and connection leaked when readLine() threw;
        // closing the BufferedReader also closes the wrapped InputStreamReader.
        if (br != null) {
            br.close();
        }
        connection.disconnect();
    }
    return rawHrefList;
}
/**
 * Extracts the charset name from a Content-Type header value,
 * e.g. "text/html; charset=utf-8" yields "utf-8".
 *
 * @param str the Content-Type header value; may be null
 * @return String the charset token, or null if none is present
 */
private String getCharset(String str) {
    // Bug fix: HttpURLConnection.getContentType() may return null, which made
    // pattern.matcher(str) throw a NullPointerException.
    if (str == null) {
        return null;
    }
    // Capture the charset token directly. The original matched "charset=.*"
    // and then split on "charset=", which (a) threw
    // ArrayIndexOutOfBoundsException on a bare trailing "charset=" and
    // (b) swallowed any parameters following the charset into the result.
    Pattern pattern = Pattern.compile("charset=([^;\\s]+)");
    Matcher matcher = pattern.matcher(str);
    if (matcher.find()) {
        return matcher.group(1);
    }
    return null;
}
/**
 * Reads every href="..." value out of one line of HTML.
 *
 * @param str one line of raw HTML
 * @return List<String> the URLs found, or null when the line has none
 *         (null is kept because the caller tests the result against null)
 */
private List<String> getHref(String str) {
    List<String> hrefSet = new ArrayList<String>();
    // Capture the quoted URL with a group instead of slicing by a hard-coded
    // offset, and iterate matcher.find() over the original string — the
    // original re-compiled a matcher on a shrinking substring each pass,
    // which was accidentally quadratic.
    Pattern pattern = Pattern.compile("href=\"(.*?)\"");
    Matcher matcher = pattern.matcher(str);
    while (matcher.find()) {
        hrefSet.add(matcher.group(1));
    }
    if (hrefSet.size() > 0) {
        return hrefSet;
    }
    return null;
}
/**
 * Rewrites the raw href list: removes duplicates, then keeps only the links
 * that belong to the parent URL.
 *
 * @param hrefList raw href values collected from the page
 * @return List<String> the cleaned-up link list
 */
private List<String> rewriteHref(List<String> hrefList) {
    return distinctByParent(distinct(hrefList));
}
/**
 * Removes duplicate links.
 *
 * @param hrefList href values possibly containing duplicates
 * @return List<String> the distinct href values (element order unspecified,
 *         exactly as in the original HashSet-based implementation)
 */
private List<String> distinct(List<String> hrefList) {
    // A HashSet collapses duplicates on construction; copying through the
    // collection constructors replaces the hand-rolled iterator loop and the
    // redundant (String) cast of the original.
    return new ArrayList<String>(new HashSet<String>(hrefList));
}
做着做着就觉得好麻烦呀,做语义分析,各种情况都要考虑,我的目标不是做一个分词工具出来,需要找个好工具才行,网上搜了一下,开始时看到html parser,不过发现代码都是2006年的,忽略,然后又看到Jsoup 和 Jericho HTML Parser,两者看起来半斤八两,不过Jericho 名字好难记,Jsoup 网站和文档好像丰富一些,OK,决定使用Jsoup 。用了才发现,太好用啦,代码清楚多了,开发效率提高,推荐!使用Jsoup后的全部代码如下(注意:现在的服务器有保护机制,同ip瞬间大量访问会被block):
package j2seTest2;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.io.IOException;
import java.util.ArrayList;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicInteger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class HtmlParser {
    final int THREAD_COUNT = 10;
    // Counts one completion per submitted worker task; await() in
    // closeThreadPool() blocks until all THREAD_COUNT tasks have finished.
    private CountDownLatch threadCompletedCounter = new CountDownLatch(THREAD_COUNT);
    // Total number of paragraphs crawled across all worker threads.
    AtomicInteger paragraphCounter = new AtomicInteger(0);

    public static void main(String[] args) {
        // testParseHref();
        // testGetHrefList();
        // testParseParagraph();
        // testCrawlParagraph();
        testCrawlParagraphInMultiThreads();
    }

    /**
     * Returns the analyzed href list for the given page.
     *
     * @param urlStr the parent page URL
     * @return List<String> de-duplicated links restricted to the parent URL
     */
    public List<String> getHrefList(String urlStr) {
        Set<String> rawHrefList = parseHref(urlStr);
        return rewriteHref(rawHrefList, urlStr);
    }

    /**
     * Parses the page's hyperlinks; the Set removes duplicates.
     *
     * @param urlStr the page URL to fetch
     * @return Set<String> raw href attribute values (empty on fetch failure)
     */
    private Set<String> parseHref(String urlStr) {
        Set<String> rawHrefSet = new HashSet<String>();
        try {
            Document doc = Jsoup.connect(urlStr).timeout(5000).get();
            Elements elements = doc.getElementsByAttribute("href");
            for (Element element : elements) {
                rawHrefSet.add(element.attr("href"));
            }
        } catch (IOException e) {
            // Best-effort: a failed fetch yields an empty set, matching the
            // original behavior of logging and continuing.
            e.printStackTrace();
        }
        return rawHrefSet;
    }

    /**
     * Rewrites the raw href set into the final link list.
     *
     * @param hrefList raw href values
     * @param parentUrl the parent page URL
     * @return List<String> links restricted to the parent URL
     */
    private List<String> rewriteHref(Set<String> hrefList, String parentUrl) {
        return distinctByParent(hrefList, parentUrl);
    }

    /**
     * Keeps only the URLs that start with the parent URL.
     *
     * @param hrefList candidate href values
     * @param parentUrl the required URL prefix
     * @return List<String> the URLs under the parent
     */
    private List<String> distinctByParent(Set<String> hrefList, String parentUrl) {
        List<String> distinctHrefList = new ArrayList<String>();
        for (String hrefStr : hrefList) {
            // Bug fix: the original used indexOf(parentUrl) >= 0, which also
            // accepted URLs merely *containing* the parent anywhere (e.g. in a
            // query string); the documented intent is "starts with parent".
            if (hrefStr.startsWith(parentUrl)) {
                distinctHrefList.add(hrefStr);
            }
        }
        return distinctHrefList;
    }

    /**
     * Returns the paragraph text of a page.
     *
     * @param urlStr the page URL
     * @return String concatenated text of the page's p and font elements
     */
    public String getParagraph(String urlStr) {
        return parseParagraph(urlStr);
    }

    /**
     * Fetches the page and extracts text from its p and font tags
     * (the target site renders biographies with either tag).
     *
     * @param urlStr the page URL
     * @return String the extracted text (empty on fetch failure)
     */
    private String parseParagraph(String urlStr) {
        final String HTML_TAG_P = "p";
        final String HTML_TAG_FONT = "font";
        StringBuilder paragraph = new StringBuilder();
        try {
            Document doc = Jsoup.connect(urlStr).timeout(5000).get();
            paragraph.append(getTextByTag(doc, HTML_TAG_P));
            paragraph.append(getTextByTag(doc, HTML_TAG_FONT));
        } catch (IOException e) {
            e.printStackTrace();
        }
        return paragraph.toString();
    }

    /**
     * Collects the text of every element with the given tag.
     *
     * @param doc the parsed document
     * @param tag the tag name to select
     * @return String the concatenated element texts
     */
    private String getTextByTag(Document doc, String tag) {
        // StringBuilder instead of repeated String.concat in a loop, which
        // copied the accumulated text on every iteration.
        StringBuilder text = new StringBuilder();
        for (Element element : doc.getElementsByTag(tag)) {
            text.append(element.text());
        }
        return text.toString();
    }

    /**
     * Crawls every link's paragraph text using THREAD_COUNT worker threads.
     *
     * @param htmlParser the parser instance to crawl with
     * @param hrefList the links to crawl
     */
    private void crawlInMultiThreads(final HtmlParser htmlParser, final List<String> hrefList) {
        ExecutorService executor = Executors.newFixedThreadPool(THREAD_COUNT);
        // Partition round-robin into exactly THREAD_COUNT chunks (some possibly
        // empty). The original's manual start/end index arithmetic produced
        // fewer than THREAD_COUNT map entries whenever
        // hrefList.size() < THREAD_COUNT, so hrefMap.get(threadIndex) returned
        // null and the submission loop threw a NullPointerException — and the
        // latch then never reached zero.
        final List<List<String>> chunks = new ArrayList<List<String>>();
        for (int i = 0; i < THREAD_COUNT; i++) {
            chunks.add(new ArrayList<String>());
        }
        for (int i = 0; i < hrefList.size(); i++) {
            chunks.get(i % THREAD_COUNT).add(hrefList.get(i));
        }
        // One task per chunk; each chunk is confined to its own task, so no
        // synchronization is needed (the original synchronized on an iterator
        // that was never shared between threads).
        for (int threadIndex = 0; threadIndex < THREAD_COUNT; threadIndex++) {
            final List<String> chunk = chunks.get(threadIndex);
            executor.submit(new Runnable() {
                public void run() {
                    try {
                        for (String hrefUrl : chunk) {
                            String paragraph = htmlParser.getParagraph(hrefUrl);
                            paragraphCounter.incrementAndGet();
                            System.out.println("paragraphCounter:" + paragraphCounter);
                            System.out.println(paragraph);
                            System.out.println("");
                        }
                    } finally {
                        // Count down even if a crawl throws, so await() in
                        // closeThreadPool() cannot hang forever.
                        threadCompletedCounter.countDown();
                    }
                }
            });
        }
        closeThreadPool(executor);
    }

    /**
     * Waits for all worker tasks to finish, then shuts the pool down.
     *
     * @param executor the pool to close
     */
    private void closeThreadPool(final ExecutorService executor) {
        try {
            threadCompletedCounter.await();
        } catch (InterruptedException e) {
            // Restore the interrupt flag instead of swallowing it.
            Thread.currentThread().interrupt();
        } finally {
            // Bug fix: shutdown() now runs even when await() is interrupted.
            executor.shutdown();
        }
    }

    private static void testParseHref() {
        HtmlParser a = new HtmlParser();
        String parentUrl = "http://www.defense.gov/bios/";
        Set<String> hrefStr = a.parseHref(parentUrl);
        for (String href : hrefStr) {
            System.out.println(href);
        }
    }

    private static void testGetHrefList() {
        HtmlParser a = new HtmlParser();
        List<String> hrefList = a.getHrefList("http://www.defense.gov/bios/");
        System.out.println("url count:" + hrefList.size());
        for (int i = 0; i < hrefList.size(); i++) {
            System.out.println(hrefList.get(i));
        }
    }

    private static void testParseParagraph() {
        HtmlParser a = new HtmlParser();
        String paragraph;
        // paragraph = a.parseParagraph("http://www.defense.gov/bios/biographydetail.aspx?biographyid=430"); //<p>
        paragraph = a.parseParagraph("http://www.defense.gov/bios/biographydetail.aspx?biographyid=185"); //<font>
        System.out.println("paragraph:");
        System.out.println(paragraph);
    }

    private static void testCrawlParagraph() {
        HtmlParser a = new HtmlParser();
        List<String> hrefList = a.getHrefList("http://www.defense.gov/bios/");
        int hrefCounter = 1;
        for (String hrefUrl : hrefList) {
            String paragraph = a.getParagraph(hrefUrl);
            System.out.println("paragraph " + hrefCounter + " from " + hrefUrl + ":");
            System.out.println(paragraph);
            System.out.println("");
            hrefCounter++;
        }
    }

    private static void testCrawlParagraphInMultiThreads() {
        HtmlParser a = new HtmlParser();
        List<String> hrefList = a.getHrefList("http://www.defense.gov/bios/");
        a.crawlInMultiThreads(a, hrefList);
    }
}
其它关于爬虫的介绍参考:
Java爬虫:http://blog.csdn.net/u012730840/article/details/19985897