As the name suggests, a crawler is a program that fetches the content we need from the Internet; Google crawls countless links off the Internet every day to maintain its database. Below is a simple crawler for the well-known TheOldNewThing blog on MSDN: it grabs all the links, organized by post and date.
BlogLayoutInterface.java
import java.io.*;
// Adapter Interface
public interface BlogLayoutInterface{
void parse(String content) throws Exception;
void findEmbeddedLinks(String baseUrl, String content, PrintWriter pw)
throws Exception;
}
BlogLayout.java
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
// Default Adapter
public abstract class BlogLayout implements BlogLayoutInterface{
protected String strURL;
protected URL url;
protected String file;
protected String username;
protected String password;
public void parse(String content) throws Exception {}
public void findEmbeddedLinks(String baseUrl, String content,
PrintWriter pw) throws Exception {}
public BlogLayout(String strURL, String username, String password)
throws MalformedURLException{
this.strURL = strURL;
url = new URL(strURL);
this.username = username;
this.password = password;
}
}
MSDNLayout.java
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class MSDNLayout extends BlogLayout{
private static Pattern pattern;
static{
final int flags = Pattern.CASE_INSENSITIVE | Pattern.DOTALL
| Pattern.MULTILINE | Pattern.UNICODE_CASE
| Pattern.CANON_EQ | Pattern.UNIX_LINES;
String regexp = "<a(.*?)//shref//s*=//s*([///" + "([///"//'//s].*?>|>)(.*?)</a>";
MSDNLayout.pattern = Pattern.compile(regexp, flags);
}
public MSDNLayout(String strURL, String username, String password)
throws MalformedURLException{
super(strURL, username, password);
}
/***************************************************************************
* This method works in three steps. First, collect all archive links from
* the home page (that is, the original link the user entered); archive
* links are usually organized by time, typically by month. Second, collect
* the calendar info for each archive. Finally, parse every calendar link
* that contains posts. On this blogs.msdn.com system a page can display
* only 20 posts and no "next page" function is provided, so when an
* archive (a month) has more than 20 posts, the rest can only be reached
* through the calendar; retrieving all posts via the calendar therefore
* makes more sense here.
**************************************************************************/
public void parse(String content) throws Exception {
// this page doesn't welcome robots :(
if(HttpUtil.rejectRobots(content)) return;
List ArchivesList = new ArrayList();
Matcher matcher = MSDNLayout.pattern.matcher(content);
String domain = null;
try{
domain = HttpUtil.getDomainFromUrl(strURL);
}catch(MalformedURLException ex){
ex.printStackTrace();
}
// baseURI is the path component of a URL
String baseURI = null;
try{
baseURI = HttpUtil.getBaseUriFromUrl(strURL);
}catch(MalformedURLException ex){
ex.printStackTrace();
}
String currentUrl;
// marker identifying the blog's archive links
String archiveLinks = "_ctl0__ctl0__ctl0__ctl0_BlogSideBar1__ctl0"
+ "__ctl1__ctl0_Categories_";
// find all archive links
while(matcher.find()){
if(matcher.group(1).contains(archiveLinks)){
currentUrl = HttpUtil.canonizeURL(domain, baseURI, matcher
.group(3));
try{
new URL(currentUrl); // validation only; throws if malformed
ArchivesList.add(currentUrl);
}catch(MalformedURLException ex){
continue;
}
}
}
List CalendarLists = new ArrayList();
// find all calendar links
Iterator it_al = ArchivesList.iterator();
while(it_al.hasNext()){
// mycontent holds one archive page, from which the
// calendar info is collected below
String mycontent = HttpUtil.openConnection((new URL(
(String)it_al.next())), username, password);
// openConnection() returns null for non-HTML content
if(mycontent == null || HttpUtil.rejectRobots(mycontent)) continue;
matcher = MSDNLayout.pattern.matcher(mycontent);
// marker identifying the blog's calendar links
String calendarLinks = "title=\"";
while(matcher.find()){
if(matcher.group(1).contains(calendarLinks)){
currentUrl = HttpUtil.canonizeURL(domain, baseURI,
matcher.group(3));
try{
new URL(currentUrl); // validation only; throws if malformed
CalendarLists.add(currentUrl);
}catch(MalformedURLException ex){
continue;
}// try/catch
}// if
}// while
}// while
// initialize the file
file = url.getHost() + ".txt";
PrintWriter pw = new PrintWriter(new BufferedWriter(
new FileWriter(file)));
Iterator it_cl = CalendarLists.iterator();
while(it_cl.hasNext()){
String mycontent = HttpUtil.openConnection((new URL(
(String)it_cl.next())), username, password);
if(mycontent == null || HttpUtil.rejectRobots(mycontent)) continue;
findEmbeddedLinks(strURL, mycontent, pw);
}
pw.close();
System.out.println("Crawling finished for " + url.getHost());
}
public void findEmbeddedLinks(String baseUrl, String content,
PrintWriter pw) throws IOException {
if(HttpUtil.rejectRobots(content)) return;
Matcher matcher = MSDNLayout.pattern.matcher(content);
String domain = null;
try{
domain = HttpUtil.getDomainFromUrl(baseUrl);
}catch(MalformedURLException ex){
ex.printStackTrace();
}
// baseURI is the path component of a URL
String baseURI = null;
try{
baseURI = HttpUtil.getBaseUriFromUrl(baseUrl);
}catch(MalformedURLException ex){
ex.printStackTrace();
}
String currentUrl = "";
int flag = 0;
while(matcher.find()){
while(matcher.group(1).contains("PermaLink")){
BlogInfo bi = new BlogInfo(username, password);
bi.setSDate(matcher.group(5));
if(matcher.find()){
if(matcher.group(1).contains("PostTitle")){
currentUrl = HttpUtil.canonizeURL(domain, baseURI,
matcher.group(3));
try{
new URL(currentUrl); // validation only; throws if malformed
bi.setSUrl(currentUrl);
bi.setSTitle(matcher.group(5));
}catch(MalformedURLException ex){
// shouldn't be reached; never happened in practice
continue;
}
}
}
if(matcher.find()){
// get all embedded links in that post
while(matcher.group(1).contains("=") == false){
currentUrl = HttpUtil.canonizeURL(domain, baseURI,
matcher.group(3));
try{
new URL(currentUrl); // validation only; throws if malformed
bi.addMEmbeddedLink(currentUrl);
}catch(MalformedURLException ex){
// skip the invalid link, but still advance the matcher
// below; a continue here would spin on the same match forever
}
if(!matcher.find()){
flag = 1;
break;
}
}// while
}// if
// output to file
pw.println(bi.toFile());
// write to file immediately
pw.flush();
// output to stdout
System.out.println(bi);
// flag == 1 means no more links; otherwise we have collected all
// embedded links in this post and reached the next post's link
if(flag == 1) break;
}// while
}// while
}
}
BlogLayoutFactory.java
import java.net.MalformedURLException;
// The Abstract Factory
public class BlogLayoutFactory{
public static BlogLayoutInterface getMSDNLayout(String strURL,
String username, String password) throws MalformedURLException {
return new MSDNLayout(strURL, username, password);
}
}
HTMLPage.java
import java.net.*;
public class HTMLPage{
protected String strURL;
protected URL url;
private String username;
private String password;
private BlogLayoutInterface bli;
public HTMLPage(String StrURL, String username, String password)
throws Exception{
this.strURL = StrURL;
url = new URL(StrURL);
bli = BlogLayoutFactory.getMSDNLayout(StrURL, username, password);
this.username = username;
this.password = password;
}
public void run() throws Exception {
String content = HttpUtil.openConnection(url, username, password);
if(content != null){
System.out.println("Start crawling " + url.getHost());
bli.parse(content);
}
}
}
HTMLapp.java
public class HTMLapp{
public static void main(String[] args) throws Exception {
/* Set proxy */
System.setProperty("proxySet", "true");
System.setProperty("http.proxyHost", "www-cache.cs.usyd.edu.au");
System.setProperty("http.proxyPort", "8000");
if(args.length > 2){
HTMLPage myHtml = new HTMLPage(args[2], args[0], args[1]);
myHtml.run();
}else
System.out.println("Usage: java HTMLapp username password"
+ " /"weblog url/"");
}
}
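For example, assuming everything compiles with the HTTPClient library on the classpath, an illustrative run (the credentials and URL here are made up for the example) would be: java HTMLapp alice secret "http://blogs.msdn.com/oldnewthing/"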
GlobalSite.java
import java.util.*;
import java.net.*;
public class GlobalSite{
private Set NewsPaperSites = new HashSet();
public GlobalSite(){
NewsPaperSites.add("abcnews.go.com");
NewsPaperSites.add("www.cnn.com");
NewsPaperSites.add("www.nytimes.com");
NewsPaperSites.add("www.usatoday.com");
NewsPaperSites.add("www.washingtonpost.com");
NewsPaperSites.add("www.latimes.com");
NewsPaperSites.add("news.ft.com");
NewsPaperSites.add("www.chicagotribune.com");
NewsPaperSites.add("www.sfgate.com");
NewsPaperSites.add("www.csmonitor.com");
NewsPaperSites.add("www.iht.com");
NewsPaperSites.add("www.dallasnews.com");
NewsPaperSites.add("www.newsday.com");
NewsPaperSites.add("www.boston.com");
NewsPaperSites.add("www.chron.com");
NewsPaperSites.add("www.nypost.com");
NewsPaperSites.add("www.startribune.com");
NewsPaperSites.add("www.denverpost.com");
NewsPaperSites.add("www.freep.com");
NewsPaperSites.add("www.signonsandiego.com");
NewsPaperSites.add("www.suntimes.com");
NewsPaperSites.add("www.detnews.com");
NewsPaperSites.add("www.investors.com");
NewsPaperSites.add("www.baltimoresun.com");
NewsPaperSites.add("www.sacbee.com");
NewsPaperSites.add("www.nydailynews.com");
NewsPaperSites.add("www.seattletimes");
NewsPaperSites.add("www.cnet.com");
NewsPaperSites.add("www.zdnet.com");
NewsPaperSites.add("www.techweb.com");
NewsPaperSites.add("www.bbc.co.uk");
}
public boolean contains(String s) throws MalformedURLException {
URL myurl = new URL(s);
return NewsPaperSites.contains(myurl.getHost());
}
}
BlogInfo.java
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class BlogInfo{
private String sTitle;
private String sDate;
private String sUrl;
private Map mEmbeddedLink;
private String username;
private String password;
private static GlobalSite gs = new GlobalSite();
public BlogInfo(String username, String password){
sTitle = "";
sDate = "";
sUrl = "";
this.username = username;
this.password = password;
mEmbeddedLink = new HashMap();
}
public boolean isBlog(String str) throws IOException {
if(str.contains("blog")) return true;
try{
URL myurl = new URL(str);
String content = HttpUtil.openConnection(myurl, username,
password);
if(content == null) return false; // non-HTML page
Pattern pattern;
int i = 0;
int flags = Pattern.CASE_INSENSITIVE | Pattern.DOTALL
| Pattern.MULTILINE | Pattern.UNICODE_CASE
| Pattern.CANON_EQ | Pattern.UNIX_LINES;
String regexp = "<h[123456]>(.*?)</h>";
pattern = Pattern.compile(regexp, flags);
Matcher matcher = pattern.matcher(content);
while(matcher.find())
if(matcher.group(1).contains("blog")) i++;
if(i >= 3) return true;
}catch(FileNotFoundException e){
return false;
}catch(Exception e){
System.err.println("Something wrong in: " + str);
}
return false;
}
public boolean isApp(String str) throws IOException {
URL myurl = new URL(str);
HttpURLConnection urlConnection;
try{
urlConnection = HttpUtil.getConnect(myurl, username, password);
String type = urlConnection.getContentType();
if(type != null && type.indexOf("text/") == -1){
return true;
}
}catch(Exception e){
System.err.println("Something wrong in: " + str);
}
return false;
}
public Map getMEmbeddedLink() {
return mEmbeddedLink;
}
public void addMEmbeddedLink(String embeddedLink) throws IOException {
if(gs.contains(embeddedLink)){
mEmbeddedLink.put(embeddedLink, "Major Newspaper");
}else if(isApp(embeddedLink)){
mEmbeddedLink.put(embeddedLink, "application");
}else if(isBlog(embeddedLink)){
mEmbeddedLink.put(embeddedLink, "blog");
}else{
mEmbeddedLink.put(embeddedLink, "others");
}
}
public String getSDate() {
return sDate;
}
public void setSDate(String date) {
sDate = date;
}
public String getSTitle() {
return sTitle;
}
public void setSTitle(String title) {
sTitle = title;
}
public String getSUrl() {
return sUrl;
}
public void setSUrl(String url) {
sUrl = url;
}
public String toString() {
String str = "";
str += "Post " + "/"" + this.sTitle + "/" on /"" + this.sDate
+ "/" has following embedded links:/n" + this.sUrl
+ " blog" + "/n";
Set keyset = this.mEmbeddedLink.keySet();
Iterator itr = keyset.iterator();
while(itr.hasNext()){
String s = (String)itr.next();
str += s + " " + this.mEmbeddedLink.get(s) + "/n";
}
return str;
}
public String toFile() {
String str = "";
Set keyset = this.mEmbeddedLink.keySet();
Iterator itr = keyset.iterator();
while(itr.hasNext()){
String s = (String)itr.next();
str += this.sTitle + " " + this.sDate + " " + this.sUrl + " "
+ s + " " + this.mEmbeddedLink.get(s) + "\n";
}
return str;
}
}
HttpUtil.java
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import HTTPClient.Codecs;
public class HttpUtil{
public static String getBaseUriFromUrl(String url)
throws MalformedURLException {
URL javaURL = new URL(url);
String path = javaURL.getPath();
int index = path.lastIndexOf("/");
if(index == -1){
return "";
}else{
return path.substring(0, index);
}
}
/**
* URLs, in anchors, can come in three flavours:
* <li>Canonical (beginning with "http://")
* <li>Absolute, non-canonical (beginning with "/")
* <li>Relative (beginning with neither "http" nor "/")
*
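* For example (illustrative inputs):
* <li>canonizeURL("http://a.com", "/b", "http://x.com/y") returns "http://x.com/y"
* <li>canonizeURL("http://a.com", "/b", "/c") returns "http://a.com/c"
* <li>canonizeURL("http://a.com", "/b", "c") returns "http://a.com/b/c"
*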
* @param domain
* @param baseUrl
* @param link
* @return
*/
public static String canonizeURL(String domain, String baseUrl,
String link) {
link = link.trim();
String ret = "";
if(link.startsWith("javascript") || link.startsWith("mailto:")){
ret = ""; //Illegal URL
}else if(link.startsWith("http")){
ret = link;
}else if(link.startsWith("www.")){
ret = "http://" + link;
}else if(link.startsWith("/")){
int indx = 0;
if(domain.endsWith("/")){
indx = 1;
}
ret = domain.substring(indx) + link;
}else{
String slash2 = "/";
if(!domain.endsWith("/")) domain = domain + "/";
if(baseUrl.startsWith("/")) baseUrl = baseUrl.substring(1);
if(link.startsWith("/")) link = link.substring(1);
if(baseUrl.equals("")){
slash2 = "";
}
if(baseUrl.endsWith("/")){
slash2 = "";
}
if(link.equals("")){
slash2 = "";
}
ret = domain + baseUrl + slash2 + link;
}
return ret;
}
public static String getDomainFromUrl(String url)
throws MalformedURLException {
URL javaURL = new URL(url);
return javaURL.getProtocol() + "://" + javaURL.getHost();
}
public static boolean rejectRobots(String content) {
Pattern pattern;
final int flags = Pattern.CASE_INSENSITIVE | Pattern.DOTALL
| Pattern.MULTILINE | Pattern.UNICODE_CASE
| Pattern.CANON_EQ | Pattern.UNIX_LINES;
String regexp = "<meta name=/"robots/" content=/"(.*?)/">";
pattern = Pattern.compile(regexp, flags);
Matcher matcher = pattern.matcher(content);
if(matcher.find()){
if(matcher.group(1).contains("noindex")
|| matcher.group(1).contains("nofollow")){
return true;
}
}
return false;
}
// returns the page that the link points to, as a String
// (or null if the content is not plain HTML)
public static String openConnection(URL url, String username,
String password) throws Exception {
HttpURLConnection urlConnection = getConnect(url, username,
password);
String type = urlConnection.getContentType();
// check whether the content is plain HTML:
// if the type is unknown, proceed anyway;
// if it is known and not HTML, skip the page
if(type != null && type.indexOf("text/html") == -1){
System.out.println("content type is " + type);
return null;
}
//urlConnection.setAllowUserInteraction(false);
String content = "";
try{
InputStream urlStream = urlConnection.getInputStream();
byte b[] = new byte[1000];
int numRead = urlStream.read(b);
if(numRead != -1){
content = new String(b, 0, numRead);
while(numRead != -1){
numRead = urlStream.read(b);
if(numRead != -1){
String newContent = new String(b, 0, numRead);
content += newContent;
}
}
}
urlStream.close();
}catch(IOException e){
System.err.println(e.getMessage());
}
urlConnection.disconnect();
return content;
}
public static HttpURLConnection getConnect(URL url, String username,
String password) throws Exception {
// can only search http: protocol URLs
if(url.getProtocol().compareTo("http") != 0){
throw new Exception("Error: does not support non-http link!");
}
HttpURLConnection urlConnection = (HttpURLConnection)url
.openConnection();
// set proxy authentication; this must be inserted between
// openConnection() and getInputStream()
String encoded = new String(Codecs.base64Encode(username + ":"
+ password));
urlConnection.setRequestProperty("Proxy-Authorization", "Basic "
+ encoded);
return urlConnection;
}
}
Here, HTTPClient is used only to authenticate with the proxy server; if you don't need a proxy, you can delete that part of the code. This was written in a hurry and, overall, without careful design, so it is quite inefficient. It first reaches every calendar page through the archives, then every post through the calendars, and only after collecting all of those links does it start grabbing the hyperlinks inside the posts. As a result, it takes a long time before the first result appears, and the run depends heavily on the network quality at the moment; that is the main bottleneck. Also, since everything runs in a single thread, the program appears to hang whenever a link-type check hits a slow-responding host; that is the other bottleneck. Luckily this project wasn't graded on my algorithms, heh, so I never thought about these issues at the time.
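For what it's worth, the second bottleneck could be eased with a small thread pool, so that one slow host no longer stalls the whole crawl. Below is a minimal sketch, not part of the original program: it assumes Java 5's java.util.concurrent, and classify() is a hypothetical stand-in for the isApp()/isBlog()/GlobalSite checks done by BlogInfo.addMEmbeddedLink().
LinkCheckerSketch.java
import java.util.concurrent.*;
public class LinkCheckerSketch{
    private final ExecutorService pool = Executors.newFixedThreadPool(8);
    // hypothetical stand-in for the per-link type checks
    String classify(String link){
        return "others";
    }
    public String classifyWithTimeout(final String link){
        // run the check on a pool thread instead of the main thread
        Future<String> f = pool.submit(new Callable<String>(){
            public String call(){
                return classify(link);
            }
        });
        try{
            // give up on a slow host after 10 seconds
            return f.get(10, TimeUnit.SECONDS);
        }catch(TimeoutException e){
            f.cancel(true);
            return "timeout";
        }catch(Exception e){
            return "error";
        }
    }
    public void shutdown(){
        pool.shutdown();
    }
}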
P.S. A curse on blogs that don't allow uploading attachments.