2月10号给dr. wang 的回信

最新推荐文章于 2019-08-03 10:13:00 发布

Leonkaka

最新推荐文章于 2019-08-03 10:13:00 发布

阅读量931

点赞数

分类专栏：毕业设计面面观 文章标签：string null stream 数据结构存储 input

本文链接：https://blog.csdn.net/Leonkaka/article/details/2087977

版权

毕业设计面面观专栏收录该内容

11 篇文章 0 订阅

订阅专栏

王老师您好，

您给我的那个爬取万方数据的程序有错误。我发现该程序的原型是 MiniCrawler，于是又把您先前发来的这个程序研究了一下，结果发现其中仍有错误（似乎被人做了手脚），错误大致集中在查找链接的函数中。我上网找到原版的 MiniCrawler 程序，下载下来研究之后，终于明白了 FindLink 是怎样工作的：关键的解析（parse）操作是由一个叫做 IndexOf 的方法实现的，这一方法拥有多种重载形式（本程序中为+8重载）。

搞明白程序的工作原理后，我又按照您在 wanfang 那个程序里的要求增加了一个队列，用来存储已下载网页中的链接（links）。下载的页面保存在 down 文件夹下。

这个程序的原理和完成的功能是：

首先下载一个起始页面，并将该页面中的所有链接存入一个队列；然后从队列中取出（出队）一个网址，下载对应的页面，再把其中新发现的链接继续入队，如此反复。每处理一个网址，程序都会爬下来很多新网址并入队，因此，这是一种类似于数据结构中广度优先搜索的爬虫程序。

不过我发现爬下来的网页，基本上都是文字内容，图片大都丢失了，但也能存下来一些，不知道为什么？

附件中提交的程序如下：

using System;
using System.Collections;
using System.Text;
using System.Net;
using System.IO;

namespace crawler
{
    class Program {

/// <summary>
/// Finds the next absolute http(s) link in an HTML string.
/// Searches <paramref name="htmlstr"/> from index <paramref name="startloc"/>
/// for the next occurrence of <c>href="http</c> (matched case-insensitively)
/// and returns the text between the double quotes of that attribute value.
/// </summary>
/// <param name="htmlstr">Page content to scan.</param>
/// <param name="startloc">
/// Index at which to start scanning. On success it is advanced to the index
/// of the closing quote of the returned link, so the next call continues
/// after it. It is left unchanged when no link is returned.
/// </param>
/// <returns>
/// The link text, or null when <paramref name="htmlstr"/> is null,
/// <paramref name="startloc"/> lies outside the string, no further match
/// exists, or the matched attribute value has no closing quote.
/// </returns>
static string FindLink(string htmlstr, ref int startloc) {
    const string marker = "href=\"http";

    if (htmlstr == null || startloc < 0 || startloc >= htmlstr.Length) {
        return null;
    }

    // Case-insensitive ordinal search on the original string: avoids building
    // a lower-cased copy of the whole page on every call and does not depend
    // on the current culture.
    int match = htmlstr.IndexOf(marker, startloc, StringComparison.OrdinalIgnoreCase);
    if (match == -1) {
        return null;
    }

    // The opening quote is part of the marker, so it is always found here.
    int start = htmlstr.IndexOf('"', match) + 1;

    // Closing quote; absent when the page is truncated or malformed.
    int end = htmlstr.IndexOf('"', start);
    if (end == -1) {
        return null;
    }

    startloc = end;
    return htmlstr.Substring(start, end - start);
}

/// <summary>
/// Entry point: breadth-first crawl starting from a fixed seed URI.
/// Each page is downloaded, saved as "&lt;n&gt;.html" in the download
/// folder (n counts pages from 0), and every link FindLink reports in it is
/// appended to a FIFO queue. The next page to fetch is taken from the front
/// of that queue. The crawl ends when the queue is empty or when one of the
/// handled network / I/O exceptions occurs.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
public static void Main(string[] args) {
    // Seed URI; replaced by the next queued link after each page.
string uri = " http://www.wanfangdata.com.cn/Search/ResourceBrowse.aspx?by=4&CurrentMode=%e6%9c%9f%e5%88%8a%e5%88%86%e7%b1%bb%7croot.T%2f%e6%9c%9f%e5%88%8a%e5%88%86%e7%b1%bb%7croot.T.TP%2fqikanquanwen%7crjxb";

    // Settings
    string downloadPath = @"E:/毕设/test/crawler/crawler/down/";

    Queue htmls = new Queue(); // URIs discovered but not yet downloaded
    int links = 0;             // number of pages downloaded so far

    try {
        while (uri != null) {
            Console.WriteLine("Linking to " + uri);

            string str = DownloadPage(uri);

            string downloadFile = downloadPath + links + ".html";
            links++;
            SavePage(downloadFile, str);

            // Scan the page from the beginning and queue every link found.
            int curloc = 0;
            string link;
            while ((link = FindLink(str, ref curloc)) != null) {
                Console.WriteLine("Link found: " + link);
                htmls.Enqueue(link);
            }
            Console.WriteLine("No link found.");

            // Take the next URI from the queue; stop cleanly once it is
            // empty instead of letting Dequeue throw.
            uri = htmls.Count > 0 ? htmls.Dequeue().ToString() : null;
        }
    } catch (WebException exc) {
        Console.WriteLine("Network Error: " + exc.Message +
                          "\nStatus code: " + exc.Status);
    } catch (ProtocolViolationException exc) {
        Console.WriteLine("Protocol Error: " + exc.Message);
    } catch (UriFormatException exc) {
        Console.WriteLine("URI Format Error: " + exc.Message);
    } catch (NotSupportedException exc) {
        Console.WriteLine("Unknown Protocol: " + exc.Message);
    } catch (IOException exc) {
        Console.WriteLine("I/O Error: " + exc.Message);
    }

    Console.WriteLine("Terminating MiniCrawler.");
}

// Downloads the resource at uri and returns its body decoded with the
// system default encoding. The response, its stream and the reader are
// disposed even if reading fails.
private static string DownloadPage(string uri) {
    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(uri);

    using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
    using (Stream istrm = resp.GetResponseStream())
    using (StreamReader rdr = new StreamReader(istrm, System.Text.Encoding.Default)) {
        return rdr.ReadToEnd();
    }
}

// Writes content to a new file at path using the system default encoding.
// FileMode.CreateNew makes this throw an IOException if the file already
// exists. Disposing the writer flushes the buffered text to disk.
private static void SavePage(string path, string content) {
    using (FileStream fs = new FileStream(path, FileMode.CreateNew, FileAccess.Write, FileShare.None))
    using (StreamWriter sw = new StreamWriter(fs, System.Text.Encoding.Default)) {
        sw.Write(content);
    }
}
}

}