As the name suggests, a crawler is a program that fetches the content we need from the Internet; Google crawls countless links off the Internet every day to maintain its database. Below is a simple crawler for the well-known TheOldNewThing blog on MSDN: it grabs all the links, organized by post and date.
BlogLayoutInterface.java
import java.io.*;
// Adapter Interface
public interface BlogLayoutInterface{
void parse(String content) throws Exception;
void findEmbeddedLinks(String baseUrl, String content, PrintWriter pw)
throws Exception;
}
BlogLayout.java
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
// Default Adapter
public abstract class BlogLayout implements BlogLayoutInterface{
protected String strURL;
protected URL url;
protected String file;
protected String username;
protected String password;
public void parse(String content) throws Exception {}
public void findEmbeddedLinks(String baseUrl, String content,
PrintWriter pw) throws Exception {}
public BlogLayout(String strURL, String username, String password)
throws MalformedURLException{
this.strURL = strURL;
url = new URL(strURL);
this.username = username;
this.password = password;
}
}
MSDNLayout.java
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class MSDNLayout extends BlogLayout{
private static Pattern pattern;
static{
final int flags = Pattern.CASE_INSENSITIVE | Pattern.DOTALL
| Pattern.MULTILINE | Pattern.UNICODE_CASE
| Pattern.CANON_EQ | Pattern.UNIX_LINES;
String regexp = "<a(.*?)//shref//s*=//s*([///" + "([///"//'//s].*?>|>)(.*?)</a>";
MSDNLayout.pattern = Pattern.compile(regexp, flags);
}
public MSDNLayout(String strURL, String username, String password)
throws MalformedURLException{
super(strURL, username, password);
}
/***************************************************************************
* This method works in three steps. First, collect all archive links from
* the home page (that is, the original link the user entered); archive
* links are usually organized by time, typically by month. Second, collect
* the calendar info for each archive. Finally, parse every calendar link
* that contains posts. On this blogs.msdn.com system a page can display
* only 20 posts and no "next page" function is provided, so when an
* archive (a month) has more than 20 posts, the rest can only be reached
* through the calendar; retrieving all posts via the calendar therefore
* makes more sense here.
**************************************************************************/
public void parse(String content) throws Exception {
// this page doesn't welcome robots :(
if(HttpUtil.rejectRobots(content)) return;
List ArchivesList = new ArrayList();
Matcher matcher = MSDNLayout.pattern.matcher(content);
String domain = null;
try{
domain = HttpUtil.getDomainFromUrl(strURL);
}catch(MalformedURLException ex){
ex.printStackTrace();
}
// baseURI is the path component of a URL
String baseURI = null;
try{
baseURI = HttpUtil.getBaseUriFromUrl(strURL);
}catch(MalformedURLException ex){
ex.printStackTrace();
}
String currentUrl;
// marker identifying the blog's archive links
String archiveLinks = "_ctl0__ctl0__ctl0__ctl0_BlogSideBar1__ctl0"
+ "__ctl1__ctl0_Categories_";
// find all archive links
while(matcher.find()){
if(matcher.group(1).contains(archiveLinks)){
currentUrl = HttpUtil.canonizeURL(domain, baseURI, matcher
.group(3));
try{
new URL(currentUrl); // validation only; throws if malformed
ArchivesList.add(currentUrl);
}catch(MalformedURLException ex){
continue;
}
}
}
List CalendarLists = new ArrayList();
// find all calendar links
Iterator it_al = ArchivesList.iterator();
while(it_al.hasNext()){
// mycontent holds one archive page, from which the
// calendar info is collected below
String mycontent = HttpUtil.openConnection((new URL(
(String)it_al.next())), username, password);
// openConnection() returns null for non-HTML content
if(mycontent == null || HttpUtil.rejectRobots(mycontent)) continue;
matcher = MSDNLayout.pattern.matcher(mycontent);
// marker identifying the blog's calendar links
String calendarLinks = "title=\"";
while(matcher.find()){
if(matcher.group(1).contains(calendarLinks)){
currentUrl = HttpUtil.canonizeURL(domain, baseURI,
matcher.group(3));
try{
new URL(currentUrl); // validation only; throws if malformed
CalendarLists.add(currentUrl);
}catch(MalformedURLException ex){
continue;
}// try/catch
}// if
}// while
}// while
// initialize the file
file = url.getHost() + ".txt";
PrintWriter pw = new PrintWriter(new BufferedWriter(
new FileWriter(file)));
Iterator it_cl = CalendarLists.iterator();
while(it_cl.hasNext()){
String mycontent = HttpUtil.openConnection((new URL(
(String)it_cl.next())), username, password);
if(mycontent == null || HttpUtil.rejectRobots(mycontent)) continue;
findEmbeddedLinks(strURL, mycontent, pw);
}
pw.close();
System.out.println("Crawling finished for " + url.getHost());
}
public void findEmbeddedLinks(String baseUrl, String content,
PrintWriter pw) throws IOException {
if(HttpUtil.rejectRobots(content)) return;
Matcher matcher = MSDNLayout.pattern.matcher(content);
String domain = null;
try{
domain = HttpUtil.getDomainFromUrl(baseUrl);
}catch(MalformedURLException ex){
ex.printStackTrace();
}
// baseURI is the path component of a URL
String baseURI = null;
try{
baseURI = HttpUtil.getBaseUriFromUrl(baseUrl);
}catch(MalformedURLException ex){
ex.printStackTrace();
}
String currentUrl = "";
int flag = 0;
while(matcher.find()){
while(matcher.group(1).contains("PermaLink")){
BlogInfo bi = new BlogInfo(username, password);
bi.setSDate(matcher.group(5));
if(matcher.find()){
if(matcher.group(1).contains("PostTitle")){
currentUrl = HttpUtil.canonizeURL(domain, baseURI,
matcher.group(3));
try{
new URL(currentUrl); // validation only; throws if malformed
bi.setSUrl(currentUrl);
bi.setSTitle(matcher.group(5));
}catch(MalformedURLException ex){
// shouldn't be reached; never happened in practice
continue;
}
}
}
if(matcher.find()){
// get all embedded links in that post
while(matcher.group(1).contains("=") == false){
currentUrl = HttpUtil.canonizeURL(domain, baseURI,
matcher.group(3));
try{
new URL(currentUrl); // validation only; throws if malformed
bi.addMEmbeddedLink(currentUrl);
}catch(MalformedURLException ex){
// skip the invalid link, but still advance the matcher
// below; a continue here would spin on the same match forever
}
if(!matcher.find()){
flag = 1;
break;
}
}// while
}// if
// output to file
pw.println(bi.toFile());
// write to file immediately
pw.flush();
// output to stdout
System.out.println(bi);
// flag == 1 means no more links; otherwise we have collected all
// embedded links in this post and reached the next post's link
if(flag == 1) break;
}// while
}// while
}
}
BlogLayoutFactory.java
import java.net.MalformedURLException;
// The Abstract Factory
public class BlogLayoutFactory{
public static BlogLayoutInterface getMSDNLayout(String strURL,
String username, String password) throws MalformedURLException {
return new MSDNLayout(strURL, username, password);
}
}
HTMLPage.java
import java.net.*;
public class HTMLPage{
protected String strURL;
protected URL url;
private String username;
private String password;
private BlogLayoutInterface bli;
public HTMLPage(String StrURL, String username, String password)
throws Exception{
this.strURL = StrURL;
url = new URL(StrURL);
bli = BlogLayoutFactory.getMSDNLayout(StrURL, username, password);
this.username = username;
this.password = password;
}
public void run() throws Exception {
String content = HttpUtil.openConnection(url, username, password);
if(content != null){
System.out.println("Start crawling " + url.getHost());
bli.parse(content);
}
}
}
HTMLapp.java
public class HTMLapp{
public static void main(String[] args) throws Exception {
/* Set proxy */
System.setProperty("proxySet", "true");
System.setProperty("http.proxyHost", "www-cache.cs.usyd.edu.au");
System.setProperty("http.proxyPort", "8000");
if(args.length > 2){
HTMLPage myHtml = new HTMLPage(args[2], args[0], args[1]);
myHtml.run();
}else
System.out.println("Usage: java HTMLapp username password"
+ " /"weblog url/"");
}
}
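For example, assuming everything compiles with the HTTPClient library on the classpath, an illustrative run (the credentials and URL here are made up for the example) would be: java HTMLapp alice secret "http://blogs.msdn.com/oldnewthing/"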
GlobalSite.java
import java.util.*;
import java.net.*;
public class GlobalSite{
private Set NewsPaperSites = new HashSet();
public GlobalSite(){
NewsPaperSites.add("abcnews.go.com");
NewsPaperSites.add("www.cnn.com");
NewsPaperSites.add("www.nytimes.com");
NewsPaperSites.add("www.usatoday.com");
NewsPaperSites.add("www.washingtonpost.com");
NewsPaperSites.add("www.latimes.com");
NewsPaperSites.add("news.ft.com");
NewsPaperSites.add("www.chicagotribune.com");
NewsPaperSites.add("www.sfgate.com");
NewsPaperSites.add("www.csmonitor.com");
NewsPaperSites.add("www.iht.com");
NewsPaperSites.add("www.dallasnews.com");
NewsPaperSites.add("www.newsday.com");
NewsPaperSites.add("www.boston.com");
NewsPaperSites.add("www.chron.com");
NewsPaperSites.add("www.nypost.com");
NewsPaperSites.add("www.startribune.com");
NewsPaperSites.add("www.denverpost.com");
NewsPaperSites.add("www.freep.com");
NewsPaperSites.add("www.signonsandiego.com");
NewsPaperSites.add("www.suntimes.com");
NewsPaperSites.add("www.detnews.com");
NewsPaperSites.add("www.investors.com");
NewsPaperSites.add("www.baltimoresun.com");
NewsPaperSites.add("www.sacbee.com");
NewsPaperSites.add("www.nydailynews.com");
NewsPaperSites.add("www.seattletimes");
NewsPaperSites.add("www.cnet.com");
NewsPaperSites.add("www.zdnet.com");
NewsPaperSites.add("www.techweb.com");
NewsPaperSites.add("www.bbc.co.uk");
}
public boolean contains(String s) throws MalformedURLException {
URL myurl = new URL(s);
return NewsPaperSites.contains(myurl.getHost());
}
}
BlogInfo.java
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class BlogInfo{
private String sTitle;
private String sDate;
private String sUrl;
private Map mEmbeddedLink;
private String username;
private String password;
private static GlobalSite gs = new GlobalSite();
public BlogInfo(String username, String password){
sTitle = "";
sDate = "";
sUrl = "";
this.username = username;
this.password = password;
mEmbeddedLink = new HashMap();
}
public boolean isBlog(String str) throws IOException {
if(str.contains("blog")) return true;
try{
URL myurl = new URL(str);
String content = HttpUtil.openConnection(myurl, username,
password);
if(content == null) return false; // non-HTML page
Pattern pattern;
int i = 0;
int flags = Pattern.CASE_INSENSITIVE | Pattern.DOTALL
| Pattern.MULTILINE | Pattern.UNICODE_CASE
| Pattern.CANON_EQ | Pattern.UNIX_LINES;
String regexp = "<h[123456]>(.*?)</h>";
pattern = Pattern.compile(regexp, flags);
Matcher matcher = pattern.matcher(content);
while(matcher.find())
if(matcher.group(1).contains("blog")) i++;
if(i >= 3) return true;
}catch(FileNotFoundException e){
return false;
}catch(Exception e){
System.err.println("Something wrong in: " + str);
}
return false;
}
public boolean isApp(String str) throws IOException {
URL myurl = new URL(str);
HttpURLConnection urlConnection;
try{
urlConnection = HttpUtil.getConnect(myurl, username, password);
String type = urlConnection.getContentType();
if(type != null && type.indexOf("text/") == -1){
return true;
}
}catch(Exception e){
System.err.println("Something wrong in: " + str);
}
return false;
}
public Map getMEmbeddedLink() {
return mEmbeddedLink;
}
public void addMEmbeddedLink(String embeddedLink) throws IOException {
if(gs.contains(embeddedLink)){
mEmbeddedLink.put(embeddedLink, "Major Newspaper");
}else if(isApp(embeddedLink)){
mEmbeddedLink.put(embeddedLink, "application");
}else if(isBlog(embeddedLink)){
mEmbeddedLink.put(embeddedLink, "blog");
}else{
mEmbeddedLink.put(embeddedLink, "others");
}
}
public String getSDate() {
return sDate;
}
public void setSDate(String date) {
sDate = date;
}
public String getSTitle() {
return sTitle;
}
public void setSTitle(String title) {
sTitle = title;
}
public String getSUrl() {
return sUrl;
}
public void setSUrl(String url) {
sUrl = url;
}
public String toString() {
String str = "";
str += "Post " + "/"" + this.sTitle + "/" on /"" + this.sDate
+ "/" has following embedded links:/n" + this.sUrl
+ " blog" + "/n";
Set keyset = this.mEmbeddedLink.keySet();
Iterator itr = keyset.iterator();
while(itr.hasNext()){
String s = (String)itr.next();
str += s + " " + this.mEmbeddedLink.get(s) + "/n";
}
return str;
}
public String toFile() {
String str = "";
Set keyset = this.mEmbeddedLink.keySet();
Iterator itr = keyset.iterator();
while(itr.hasNext()){
String s = (String)itr.next();
str += this.sTitle + " " + this.sDate + " " + this.sUrl + " "
+ s + " " + this.mEmbeddedLink.get(s) + "\n";
}
return str;
}
}
HttpUtil.java
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import HTTPClient.Codecs;
public class HttpUtil{
public static String getBaseUriFromUrl(String url)
throws MalformedURLException {
URL javaURL = new URL(url);
String path = javaURL.getPath();
int index = path.lastIndexOf("/");
if(index == -1){
return "";
}else{
return path.substring(0, index);
}
}
/**
* URLs, in anchors, can come in three flavours:
* <li>Canonical (beginning with "http://")
* <li>Absolute, non-canonical (beginning with "/")
* <li>Relative (beginning with neither "http" nor "/")
*
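* For example (illustrative inputs):
* <li>canonizeURL("http://a.com", "/b", "http://x.com/y") returns "http://x.com/y"
* <li>canonizeURL("http://a.com", "/b", "/c") returns "http://a.com/c"
* <li>canonizeURL("http://a.com", "/b", "c") returns "http://a.com/b/c"
*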
* @param domain
* @param baseUrl
* @param link
* @return
*/
public static String canonizeURL(String domain, String baseUrl,
String link) {
link = link.trim();
String ret = "";
if(link.startsWith("javascript") || link.startsWith("mailto:")){
ret = ""; //Illegal URL
}else if(link.startsWith("http")){
ret = link;
}else if(link.startsWith("www.")){
ret = "http://" + link;
}else if(link.startsWith("/")){
int indx = 0;
if(domain.endsWith("/")){
indx = 1;
}
ret = domain.substring(indx) + link;
}else{
String slash2 = "/";
if(!domain.endsWith("/")) domain = domain + "/";
if(baseUrl.startsWith("/")) baseUrl = baseUrl.substring(1);
if(link.startsWith("/")) link = link.substring(1);
if(baseUrl.equals("")){
slash2 = "";
}
if(baseUrl.endsWith("/")){
slash2 = "";
}
if(link.equals("")){
slash2 = "";
}
ret = domain + baseUrl + slash2 + link;
}
return ret;
}
public static String getDomainFromUrl(String url)
throws MalformedURLException {
URL javaURL = new URL(url);
return javaURL.getProtocol() + "://" + javaURL.getHost();
}
public static boolean rejectRobots(String content) {
Pattern pattern;
final int flags = Pattern.CASE_INSENSITIVE | Pattern.DOTALL
| Pattern.MULTILINE | Pattern.UNICODE_CASE
| Pattern.CANON_EQ | Pattern.UNIX_LINES;
String regexp = "<meta name=/"robots/" content=/"(.*?)/">";
pattern = Pattern.compile(regexp, flags);
Matcher matcher = pattern.matcher(content);
if(matcher.find()){
if(matcher.group(1).contains("noindex")
|| matcher.group(1).contains("nofollow")){
return true;
}
}
return false;
}
// returns the page that the link points to, as a String
// (or null if the content is not plain HTML)
public static String openConnection(URL url, String username,
String password) throws Exception {
HttpURLConnection urlConnection = getConnect(url, username,
password);
String type = urlConnection.getContentType();
// check whether the content is plain HTML:
// if the type is unknown, proceed anyway;
// if it is known and not HTML, skip the page
if(type != null && type.indexOf("text/html") == -1){
System.out.println("content type is " + type);
return null;
}
//urlConnection.setAllowUserInteraction(false);
String content = "";
try{
InputStream urlStream = urlConnection.getInputStream();
byte b[] = new byte[1000];
int numRead = urlStream.read(b);
if(numRead != -1){
content = new String(b, 0, numRead);
while(numRead != -1){
numRead = urlStream.read(b);
if(numRead != -1){
String newContent = new String(b, 0, numRead);
content += newContent;
}
}
}
urlStream.close();
}catch(IOException e){
System.err.println(e.getMessage());
}
urlConnection.disconnect();
return content;
}
public static HttpURLConnection getConnect(URL url, String username,
String password) throws Exception {
// can only search http: protocol URLs
if(url.getProtocol().compareTo("http") != 0){
throw new Exception("Error: does not support non-http link!");
}
HttpURLConnection urlConnection = (HttpURLConnection)url
.openConnection();
// set proxy authentication; this must be inserted between
// openConnection() and getInputStream()
String encoded = new String(Codecs.base64Encode(username + ":"
+ password));
urlConnection.setRequestProperty("Proxy-Authorization", "Basic "
+ encoded);
return urlConnection;
}
}
Here, HTTPClient is used only to authenticate with the proxy server; if you don't need a proxy, you can delete that part of the code. This was written in a hurry and, overall, without careful design, so it is quite inefficient. It first reaches every calendar page through the archives, then every post through the calendars, and only after collecting all of those links does it start grabbing the hyperlinks inside the posts. As a result, it takes a long time before the first result appears, and the run depends heavily on the network quality at the moment; that is the main bottleneck. Also, since everything runs in a single thread, the program appears to hang whenever a link-type check hits a slow-responding host; that is the other bottleneck. Luckily this project wasn't graded on my algorithms, heh, so I never thought about these issues at the time.
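For what it's worth, the second bottleneck could be eased with a small thread pool, so that one slow host no longer stalls the whole crawl. Below is a minimal sketch, not part of the original program: it assumes Java 5's java.util.concurrent, and classify() is a hypothetical stand-in for the isApp()/isBlog()/GlobalSite checks done by BlogInfo.addMEmbeddedLink().
LinkCheckerSketch.java
import java.util.concurrent.*;
public class LinkCheckerSketch{
    private final ExecutorService pool = Executors.newFixedThreadPool(8);
    // hypothetical stand-in for the per-link type checks
    String classify(String link){
        return "others";
    }
    public String classifyWithTimeout(final String link){
        // run the check on a pool thread instead of the main thread
        Future<String> f = pool.submit(new Callable<String>(){
            public String call(){
                return classify(link);
            }
        });
        try{
            // give up on a slow host after 10 seconds
            return f.get(10, TimeUnit.SECONDS);
        }catch(TimeoutException e){
            f.cancel(true);
            return "timeout";
        }catch(Exception e){
            return "error";
        }
    }
    public void shutdown(){
        pool.shutdown();
    }
}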
P.S. A curse on blogs that don't allow uploading attachments.