My Toy No.3: Weblog Crawler

顾名思义,爬虫就是在internet上把我们需要的内容抓回来的程序,google每天都从internet上爬回无数链接来维护它的数据库。这里是一个简单的爬虫,针对MSDN上有名的TheOldNewThing,按post、日期分类,把所有的links抓回来。

BlogLayoutInterface.java

import java.io.*;

// Adapter Interface
public interface BlogLayoutInterface{

    void parse(String content) throws Exception;
    void findEmbeddedLinks(String baseUrl, String content, PrintWriter pw)
            throws Exception;
}

BlogLayout.java

import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;

// Default Adapter
public abstract class BlogLayout implements BlogLayoutInterface{

    protected String strURL;
    protected URL url;
    protected String file;
    protected String username;
    protected String password;

    public void parse(String content) throws Exception {}
    public void findEmbeddedLinks(String baseUrl, String content,
            PrintWriter pw) throws Exception {}

    public BlogLayout(String strURL, String username, String password)
            throws MalformedURLException{
        this.strURL = strURL;
        url = new URL(strURL);
        this.username = username;
        this.password = password;
    }
}

MSDNLayout.java

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class MSDNLayout extends BlogLayout{

    private static Pattern pattern;

    static{
        final int flags = Pattern.CASE_INSENSITIVE | Pattern.DOTALL
                | Pattern.MULTILINE | Pattern.UNICODE_CASE
                | Pattern.CANON_EQ | Pattern.UNIX_LINES;

        String regexp = "<a(.*?)//shref//s*=//s*([///"                + "([///"//'//s].*?>|>)(.*?)</a>";

        MSDNLayout.pattern = Pattern.compile(regexp, flags);
    }

    public MSDNLayout(String strURL, String username, String password)
            throws MalformedURLException{
        super(strURL, username, password);
    }

    /***************************************************************************
     * this method perform in 3 steps, first get all archive links from homepage
     * (or first page, that is, the original links user input). usually, archive
     * links are organized by time, say, month. so, in second step, get all
     * calendar info for each archive. finally, parse the every calendar links
     * which contains post. for this blog.msdn.com system, one page can only
     * display 20 posts and no "next page" function provided, if there are more
     * than 20 posts in one archive(or month), the rest posts can only be
     * accessed through the calendar. so retrieving all posts by taking
     * advantage of the calendar makes more sense in this case.
     **************************************************************************/
    public void parse(String content) throws Exception {

        // this page dont welcome robot:(
        if(HttpUtil.rejectRobots(content)) return;

        List ArchivesList = new ArrayList();

        Matcher matcher = MSDNLayout.pattern.matcher(content);
        String domain = null;
        try{
            domain = HttpUtil.getDomainFromUrl(strURL);
        }catch(MalformedURLException ex){
            ex.printStackTrace();
        }

        // baseURI is the path component of a URL
        String baseURI = null;
        try{
            baseURI = HttpUtil.getBaseUriFromUrl(strURL);
        }catch(MalformedURLException ex){
            ex.printStackTrace();
        }

        String currentUrl;

        // blog defined archives links
        String archiveLinks = "_ctl0__ctl0__ctl0__ctl0_BlogSideBar1__ctl0"
                + "__ctl1__ctl0_Categories_";

        // find all archive links
        while(matcher.find()){
            if(matcher.group(1).contains(archiveLinks)){
                currentUrl = HttpUtil.canonizeURL(domain, baseURI, matcher
                        .group(3));
                try{
                    URL javaCurrentUrl = new URL(currentUrl);
                    ArchivesList.add(currentUrl);
                }catch(MalformedURLException ex){
                    continue;
                }
            }
        }

        List CalendarLists = new ArrayList();

        // find all calendar links
        Iterator it_al = ArchivesList.iterator();
        while(it_al.hasNext()){
            // this mycontent contains every archive, which should
            // be used to collect calendar info later.
            String mycontent = HttpUtil.openConnection((new URL(
                    (String)it_al.next())), username, password);
            if(HttpUtil.rejectRobots(mycontent)) continue;
            matcher = MSDNLayout.pattern.matcher(mycontent);

            // blog defined calendar links
            String calendarLinks = "title=/"";

            while(matcher.find()){
                if(matcher.group(1).contains(calendarLinks)){
                    currentUrl = HttpUtil.canonizeURL(domain, baseURI,
                            matcher.group(3));
                    try{
                        URL javaCurrentUrl = new URL(currentUrl);
                        CalendarLists.add(currentUrl);
                    }catch(MalformedURLException ex){
                        continue;
                    }// try/catch
                }// if
            }// while
        }// while

        // initialize the file
        file = url.getHost() + ".txt";
        PrintWriter pw = new PrintWriter(new BufferedWriter(
                new FileWriter(file)));
        Iterator it_cl = CalendarLists.iterator();
        while(it_cl.hasNext()){
            String mycontent = HttpUtil.openConnection((new URL(
                    (String)it_cl.next())), username, password);
            if(HttpUtil.rejectRobots(mycontent)) continue;
            findEmbeddedLinks(strURL, mycontent, pw);
        }

        pw.close();
        System.out.println("Crawling finished for " + url.getHost());
    }

    public void findEmbeddedLinks(String baseUrl, String content,
            PrintWriter pw) throws IOException {
        if(HttpUtil.rejectRobots(content)) return;
        List list = new ArrayList();

        Matcher matcher = MSDNLayout.pattern.matcher(content);
        String domain = null;
        try{
            domain = HttpUtil.getDomainFromUrl(baseUrl);
        }catch(MalformedURLException ex){
            ex.printStackTrace();
        }

        // baseURI is the path component of a URL
        String baseURI = null;
        try{
            baseURI = HttpUtil.getBaseUriFromUrl(baseUrl);
        }catch(MalformedURLException ex){
            ex.printStackTrace();
        }

        String currentUrl = "";
        int flag = 0;

        while(matcher.find()){
            while(matcher.group(1).contains("PermaLink")){
                BlogInfo bi = new BlogInfo(username, password);
                bi.setSDate(matcher.group(5));
                if(matcher.find()){
                    if(matcher.group(1).contains("PostTitle")){
                        currentUrl = HttpUtil.canonizeURL(domain, baseURI,
                                matcher.group(3));
                        try{
                            URL javaCurrentUrl = new URL(currentUrl);
                            bi.setSUrl(currentUrl);
                            bi.setSTitle(matcher.group(5));
                        }catch(MalformedURLException ex){
                            //shouldn't reach here, never happened
                            continue;
                        }
                    }
                }
                if(matcher.find()){
                    // get all embeddedlinks in that post
                    while(matcher.group(1).contains("=") == false){
                        currentUrl = HttpUtil.canonizeURL(domain, baseURI,
                                matcher.group(3));
                        try{
                            URL javaCurrentUrl = new URL(currentUrl);
                            bi.addMEmbeddedLink(currentUrl);
                        }catch(MalformedURLException ex){
                            continue;
                        }
                        if(!matcher.find()){
                            flag = 1;
                            break;
                        }
                    }// while
                }// if
                // output to file
                pw.println(bi.toFile());
                // write to file immediately
                pw.flush();
                // output to stdout
                System.out.println(bi);

                // no more links, otherwise means find all embeddedlinks
                // in one post and reach next post's link
                if(flag == 1) break;
            }// while
        }// while
    }
}

BlogLayoutFactory.java

import java.net.MalformedURLException;

// The Abstract Factory
public class BlogLayoutFactory{

    public static BlogLayoutInterface getMSDNLayout(String strURL,
            String username, String password) throws MalformedURLException {
        return new MSDNLayout(strURL, username, password);
    }
}

HTMLPage.java

import java.net.*;

public class HTMLPage{

    protected String strURL;
    protected URL url;
    private String username;
    private String password;
    private BlogLayoutInterface bli;

    public HTMLPage(String StrURL, String username, String password)
            throws Exception{
        this.strURL = StrURL;
        url = new URL(StrURL);
        bli = BlogLayoutFactory.getMSDNLayout(StrURL, username, password);
        this.username = username;
        this.password = password;
    }

    public void run() throws Exception {
        String content = HttpUtil.openConnection(url, username, password);
        if(content != null){
            System.out.println("Start crawling " + url.getHost());
            bli.parse(content);
        }
    }
}

HTMLapp.java

public class HTMLapp{

    public static void main(String[] args) throws Exception {

        /* Set proxy */

        System.setProperty("proxySet", "true");
        System.setProperty("http.proxyHost", "www-cache.cs.usyd.edu.au");
        System.setProperty("http.proxyPort", "8000");

        if(args.length > 2){
            HTMLPage myHtml = new HTMLPage(args[2], args[0], args[1]);
            myHtml.run();
        }else
            System.out.println("Usage: java HTMLapp username password"
                    + " /"weblog url/"");
    }
}


GlobalSite.java

import java.util.*;
import java.net.*;

public class GlobalSite{

    private Set NewsPaperSites = new HashSet();
    public GlobalSite(){
        NewsPaperSites.add("abcnews.go.com");
        NewsPaperSites.add("www.cnn.com");
        NewsPaperSites.add("www.nytimes.com");
        NewsPaperSites.add("www.usatoday.com");
        NewsPaperSites.add("www.washingtonpost.com");
        NewsPaperSites.add("www.latimes.com");
        NewsPaperSites.add("news.ft.com");
        NewsPaperSites.add("www.chicagotribune.com");
        NewsPaperSites.add("www.sfgate.com");
        NewsPaperSites.add("www.csmonitor.com");
        NewsPaperSites.add("www.iht.com");
        NewsPaperSites.add("www.dallasnews.com");
        NewsPaperSites.add("www.newsday.com");
        NewsPaperSites.add("www.boston.com");
        NewsPaperSites.add("www.chron.com");
        NewsPaperSites.add("www.nypost.com");
        NewsPaperSites.add("www.startribune.com");
        NewsPaperSites.add("www.denverpost.com");
        NewsPaperSites.add("www.freep.com");
        NewsPaperSites.add("www.signonsandiego.com");
        NewsPaperSites.add("www.suntimes.com");
        NewsPaperSites.add("www.detnews.com");
        NewsPaperSites.add("www.investors.com");
        NewsPaperSites.add("www.baltimoresun.com");
        NewsPaperSites.add("www.sacbee.com");
        NewsPaperSites.add("www.nydailynews.com");
        NewsPaperSites.add("www.seattletimes");
        NewsPaperSites.add("www.cnet.com");
        NewsPaperSites.add("www.zdnet.com");
        NewsPaperSites.add("www.techweb.com");
        NewsPaperSites.add("www.bbc.co.uk");
    }

    public boolean contains(String s) throws MalformedURLException {
        URL myurl = new URL(s);
        return NewsPaperSites.contains(myurl.getHost());
    }
}

BlogInfo.java

import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class BlogInfo{

    private String sTitle;
    private String sDate;
    private String sUrl;
    private Map mEmbeddedLink;
    private String username;
    private String password;
    private static GlobalSite gs = new GlobalSite();

    public BlogInfo(String username, String password){
        sTitle = "";
        sDate = "";
        sUrl = "";
        this.username = username;
        this.password = password;
        mEmbeddedLink = new HashMap();
    }

    public boolean isBlog(String str) throws IOException {
        if(str.contains("blog")) return true;
        try{
            URL myurl = new URL(str);
            String content = HttpUtil.openConnection(myurl, username,
                    password);

            Pattern pattern;
            int i = 0;

            int flags = Pattern.CASE_INSENSITIVE | Pattern.DOTALL
                    | Pattern.MULTILINE | Pattern.UNICODE_CASE
                    | Pattern.CANON_EQ | Pattern.UNIX_LINES;

            String regexp = "<h[123456]>(.*?)</h>";

            pattern = Pattern.compile(regexp, flags);
            Matcher matcher = pattern.matcher(content);

            while(matcher.find())
                if(matcher.group(1).contains("blog")) i++;

            if(i >= 3) return true;
        }catch(FileNotFoundException e){
            return false;
        }catch(Exception e){
            System.err.println("Something wrong in: " + str);
        }
        return false;
    }

    public boolean isApp(String str) throws IOException {
        URL myurl = new URL(str);
        HttpURLConnection urlConnection;
        try{
            urlConnection = HttpUtil.getConnect(myurl, username, password);
            String type = urlConnection.getContentType();
            if(type != null && type.indexOf("text/") == -1){
                return true;
            }
        }catch(Exception e){
            System.err.println("Something wrong in: " + str);
        }
        return false;
    }

    public Map getMEmbeddedLink() {
        return mEmbeddedLink;
    }
    public void addMEmbeddedLink(String embeddedLink) throws IOException {
        if(gs.contains(embeddedLink)){
            mEmbeddedLink.put(embeddedLink, "Major Newspaper");
        }else if(isApp(embeddedLink)){
            mEmbeddedLink.put(embeddedLink, "application");
        }else if(isBlog(embeddedLink)){
            mEmbeddedLink.put(embeddedLink, "blog");
        }else{
            mEmbeddedLink.put(embeddedLink, "others");
        }
    }
    public String getSDate() {
        return sDate;
    }
    public void setSDate(String date) {
        sDate = date;
    }
    public String getSTitle() {
        return sTitle;
    }
    public void setSTitle(String title) {
        sTitle = title;
    }
    public String getSUrl() {
        return sUrl;
    }
    public void setSUrl(String url) {
        sUrl = url;
    }

    public String toString() {
        String str = "";
        str += "Post " + "/"" + this.sTitle + "/" on /"" + this.sDate
                + "/" has following embedded links:/n" + this.sUrl
                + " blog" + "/n";

        Set keyset = this.mEmbeddedLink.keySet();
        Iterator itr = keyset.iterator();

        while(itr.hasNext()){
            String s = (String)itr.next();
            str += s + " " + this.mEmbeddedLink.get(s) + "/n";
        }
        return str;
    }

    public String toFile() {
        String str = "";
        Set keyset = this.mEmbeddedLink.keySet();
        Iterator itr = keyset.iterator();

        while(itr.hasNext()){
            String s = (String)itr.next();
            str += this.sTitle + " " + this.sDate + " " + this.sUrl + " "
                    + s + " " + this.mEmbeddedLink.get(s) + "/n";
        }
        return str;
    }
}

HttpUtil.java

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Base64;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class HttpUtil{

    public static String getBaseUriFromUrl(String url)
            throws MalformedURLException {
        URL javaURL = new URL(url);
        String path = javaURL.getPath();
        int index = path.lastIndexOf("/");
        if(index == -1){
            return "";
        }else{
            return path.substring(0, index);
        }
    }

    /**
     * URLs, in anchors, can come in three flavours:
     * <li>Canonical (begining with "http://")
     * <li>Absolute, non-canonical (begining with "/")
     * <li>Relative (not begining with either "http" or "/")
     *
     * @param domain
     * @param baseUrl
     * @param link
     * @return
     */
    public static String canonizeURL(String domain, String baseUrl,
            String link) {
        link = link.trim();
        String ret = "";

        if(link.startsWith("javascript") || link.startsWith("mailto:")){
            ret = ""; //Illegal URL
        }else if(link.startsWith("http")){
            ret = link;
        }else if(link.startsWith("www.")){
            ret = "http://" + link;
        }else if(link.startsWith("/")){
            int indx = 0;
            if(domain.endsWith("/")){
                indx = 1;
            }
            ret = domain.substring(indx) + link;
        }else{
            String slash2 = "/";

            if(!domain.endsWith("/")) domain = domain + "/";
            if(baseUrl.startsWith("/")) baseUrl = baseUrl.substring(1);
            if(link.startsWith("/")) link = link.substring(1);
            if(baseUrl.equals("")){
                slash2 = "";
            }
            if(baseUrl.endsWith("/")){
                slash2 = "";
            }
            if(link.equals("")){
                slash2 = "";
            }
            ret = domain + baseUrl + slash2 + link;
        }
        return ret;
    }

    public static String getDomainFromUrl(String url)
            throws MalformedURLException {
        URL javaURL = new URL(url);
        return javaURL.getProtocol() + "://" + javaURL.getHost();
    }

    public static boolean rejectRobots(String content) {
        Pattern pattern;

        final int flags = Pattern.CASE_INSENSITIVE | Pattern.DOTALL
                | Pattern.MULTILINE | Pattern.UNICODE_CASE
                | Pattern.CANON_EQ | Pattern.UNIX_LINES;

        String regexp = "<meta name=/"robots/" content=/"(.*?)/">";

        pattern = Pattern.compile(regexp, flags);
        Matcher matcher = pattern.matcher(content);
        if(matcher.find()){
            if(matcher.group(1).contains("noindex")
                    || matcher.group(1).contains("nofollow")){
                return true;
            }
        }
        return false;
    }

    // this method will return a page which link points to in
    // form of a String.
    public static String openConnection(URL url, String username,
            String password) throws Exception {

        HttpURLConnection urlConnection = getConnect(url, username,
                password);

        String type = urlConnection.getContentType();

        // check again if the connection is plain html.
        // if not known, continue to proceed
        // if not plain html, don't do anything
        if(type != null && type.indexOf("text/html") == -1){
            System.out.println("contenet type is " + type);
            return null;
        }

        //urlConnection.setAllowUserInteraction(false);
        String content = "";
        try{
            InputStream urlStream = urlConnection.getInputStream();

            byte b[] = new byte[1000];
            int numRead = urlStream.read(b);

            if(numRead != -1){
                content = new String(b, 0, numRead);
                while(numRead != -1){
                    numRead = urlStream.read(b);
                    if(numRead != -1){
                        String newContent = new String(b, 0, numRead);
                        content += newContent;
                    }
                }
            }
            urlStream.close();
        }catch(IOException e){
            System.err.println(e.getMessage());
        }
        urlConnection.disconnect();
        return content;
    }

    public static HttpURLConnection getConnect(URL url, String username,
            String password) throws Exception {

        // can only search http: protocol URLs
        if(url.getProtocol().compareTo("http") != 0){
            throw new Exception("Error: does not support non-http link!");
        }

        HttpURLConnection urlConnection = (HttpURLConnection)url
                .openConnection();

        // set proxy authentication
        // insert between openConnection and getInputStream
        String encoded = new String(Codecs.base64Encode(username + ":"
                + password));
        urlConnection.setRequestProperty("Proxy-Authorization", "Basic "
                + encoded);

        return urlConnection;
    }
}


其中,HTTPClient用来设置代理服务器,如果不需要,可以删掉这部分代码。写的很匆忙,总的来说,这段程序没有经过细致设计,效率很低。它先通过archives访问所有的calendar,再经由calendar访问所有的post,得到这些的links之后,才开始抓post之内的超链接。这样一来,等第一个结果跳出来需要花很长时间,很依赖当时网络的质量。这是最主要的瓶颈。另外,由于是单线程,当检查个别链接的类型时,如果遇到反应时间很长的链接,程序会变的没反应。这是另一个瓶颈。好在这个project不是考察我的算法。嘿嘿,所以当初根本没有考虑这些问题。

p.s. 赌咒不能上传附件的Blog

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
### 回答1: sogouq.reduced weblog是搜狗搜索在2016年推出的一款基于用户搜索行为数据的网站分析工具。该工具可以对网站流量、用户行为、竞争分析等方面进行深入分析,帮助网站管理者进行数据驱动的决策,提高网站的用户体验和营销效益。 该工具提供了多个维度的数据分析功能,包括来源渠道、访问路径、访问时长、热门页面、搜索关键词等。用户只需在工具中输入要分析的网站地址,即可获得详尽的数据分析报告,比如该网站的月度流量变化、访问页面的热度排行、用户的地域分布和兴趣偏好等等。 在竞争分析方面,该工具可以帮用户挖掘竞争对手的流量来源、优势关键词、用户行为等信息,帮助用户针对性地优化网站的内容和推广策略。 总的来说,sogouq.reduced weblog是一款十分实用的网站分析工具,可以帮助企业和个人了解自己网站的情况,及时发现问题和优化方向,提升网站的竞争力和用户体验,同时也对互联网营销和数据分析领域的发展起到了重要的推动作用。 ### 回答2: Sogouq.reduced Weblog是一种网络日志,是搜狗公司推出的一项服务。该服务可以帮助用户创建和管理自己的博客,同时还能提供一些额外的功能,如编辑器、主题库、插件等。 通过Sogouq.reduced Weblog,用户可以分享自己的思想、心情、经验或任何其他内容。可以随时随地更新和发布文章,让更多的人关注自己的想法。同时,也可以与其他博客主交互,分享经验,互相学习进步。 Sogouq.reduced Weblog提供了许多现代化的功能,使得博客的操作变得更加简单和便捷。例如,可以定制自己的主题,从而让自己的博客变得更加个性化;还可以添加各种插件,从而扩展博客的功能性。这一切都让用户能够更加专注于创作内容,而不是困在技术细节的泥潭中。 总的来说,Sogouq.reduced Weblog是一个非常实用的网络服务,它可以帮助用户实现自己的博客梦想,同时还能提供各种便捷的功能,让用户可以专注于内容创作。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值