抓取网页相关方法

最近在学习搜索引擎,记录一下相关信息

嵌入式数据库

  • BDB
  • Perst

信息过滤

  • Aho-Corasick

抓取网页的方法

  • 使用TcpClient
/// <summary>
/// Fetches the root page of www.sina.com.cn by writing an HTTP/1.1 GET request
/// by hand over a raw TCP connection, then prints the whole response (status
/// line, headers and body) to the console.
/// </summary>
/// <remarks>
/// Any exception is caught and only its message is written to the console.
/// </remarks>
private static void TCPClientMethod()
{
    string hostName = "www.sina.com.cn";
    // Plain HTTP listens on port 80; the Host header below carries no port,
    // so the request already assumes the default HTTP port.
    int portNumber = 80;

    // Disposing the client also closes the NetworkStream obtained from it, so the
    // writer and reader layered on that stream are intentionally not disposed.
    using (TcpClient client = new TcpClient())
    {
        try
        {
            client.Connect(hostName, portNumber);
            Console.Write("链接上了");

            NetworkStream clientStream = client.GetStream();

            // The request line needs single spaces between method, target and
            // version ("GET / HTTP/1.1"); without them the line is malformed.
            StreamWriter requestWriter = new StreamWriter(clientStream);
            requestWriter.Write("GET / HTTP/1.1\r\n"
                + "User-Agent:craler request!\r\n"
                + "Host:" + hostName + "\r\n"
                + "Connection:Close\r\n"
                + "\r\n");
            requestWriter.Flush();

            // "Connection:Close" asks the server to close the socket after the
            // response, so reading to end-of-stream yields the full response.
            // A StreamReader decodes only the bytes actually received and copes
            // with multi-byte UTF-8 sequences that straddle two network reads.
            StreamReader responseReader = new StreamReader(clientStream, Encoding.UTF8);
            string text = responseReader.ReadToEnd();
            Console.WriteLine(text);
        }
        catch (Exception ex)
        {
            Console.WriteLine(ex.Message);
        }
    }
}
  • 使用WebRequest
/// <summary>
/// Downloads <paramref name="urlDownLoad"/> with <see cref="HttpWebRequest"/>,
/// reads the whole body as UTF-8 text and prints every response header to the console.
/// </summary>
/// <param name="urlDownLoad">URL to request; the created request is cast to <see cref="HttpWebRequest"/>.</param>
private static void WebRequestMethod(string urlDownLoad)
{
    // Build the request object that represents the HTTP call to the URL.
    HttpWebRequest request = (HttpWebRequest)System.Net.WebRequest.Create(urlDownLoad);

    // The using statements release the response and its stream even when
    // reading the body or printing the headers throws.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (TextReader bodyReader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("utf-8")))
    {
        // Read the body from start to end; this sample does not use it any further.
        string htmlContent = bodyReader.ReadToEnd();

        // Dump the response headers as "Header <name> : <value>".
        WebHeaderCollection headers = response.Headers;
        for (int i = 0; i < headers.Count; i++)
        {
            Console.WriteLine("Header " + headers.GetKey(i) + " : " + headers[i]);
        }
    }
}
  • 使用WebClient
/// <summary>
/// Downloads <paramref name="url"/> with <see cref="WebClient"/> and prints the
/// response body to the console.
/// </summary>
/// <param name="url">Address of the resource to download.</param>
private static void WebClientMethod(string url)
{
    // Nested using statements dispose the reader, stream and client in reverse
    // order, including when OpenRead or ReadToEnd throws.
    using (WebClient webclient = new WebClient())
    using (Stream stream = webclient.OpenRead(url))
    using (StreamReader reader = new StreamReader(stream))
    {
        string strResult = reader.ReadToEnd();
        Console.WriteLine(strResult);
    }
}

提交数据的方法

  • GET
/// <summary>
/// Shows how to configure an HTTP GET request: creates the request for
/// <paramref name="url"/>, sets the method and a browser-like User-Agent header.
/// The request is only prepared here; it is never sent.
/// </summary>
/// <param name="url">Target URL; must use a scheme that yields an <see cref="HttpWebRequest"/>.</param>
private static void GETMethod(string url)
{
    string defaultUserAgent = "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.2;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727)";

    // A direct cast fails immediately with InvalidCastException for a non-HTTP URL,
    // instead of "as" producing null and a NullReferenceException on the next line.
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.Method = "GET";
    // Set header information.
    request.UserAgent = defaultUserAgent;
}
  • POST
/// <summary>
/// Shows how to configure an HTTP POST request: creates the request for
/// <paramref name="url"/>, sets the method and a form-encoded Content-Type header.
/// The request is only prepared here; no body is written and it is never sent.
/// </summary>
/// <param name="url">Target URL; must use a scheme that yields an <see cref="HttpWebRequest"/>.</param>
private static void POSTMethod(string url)
{
    // A direct cast fails immediately with InvalidCastException for a non-HTTP URL,
    // instead of "as" producing null and a NullReferenceException on the next line.
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.Method = "POST";
    // Set header information.
    request.ContentType = "application/x-www-form-urlencoded";
}
  • HEAD 获取url页面的长度

/// <summary>
/// Issues an HTTP HEAD request for <paramref name="url"/> and returns the
/// Content-Length reported by the server.
/// </summary>
/// <param name="url">Absolute URL of the page to probe.</param>
/// <returns>
/// The response's <c>ContentLength</c> when the status is 200 OK; 0 for any other
/// status or when any exception occurs (invalid URL, timeout, network failure, ...).
/// </returns>
private static long HEADGetWebLength(string url)
{
    try
    {
        HttpWebRequest req = (HttpWebRequest)WebRequest.CreateDefault(new Uri(url));
        req.Method = "HEAD";
        req.Timeout = 5000; // milliseconds

        // The using statement releases the connection on every path out of the block.
        using (HttpWebResponse res = (HttpWebResponse)req.GetResponse())
        {
            if (res.StatusCode == HttpStatusCode.OK)
            {
                return res.ContentLength;
            }

            return 0;
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort: every failure is reported as length 0.
        return 0;
    }
}
  • 使用WebClient提交POST数据的方法(非常简单)
/// <summary>
/// Uploads <paramref name="postString"/> to <paramref name="url"/> through
/// <see cref="WebClient.UploadString(string, string)"/> with a JSON Content-Type
/// header and UTF-8 encoding, then prints the server's reply to the console.
/// </summary>
/// <param name="url">Address that receives the upload.</param>
/// <param name="postString">Payload to send as the request body.</param>
public static void Post(string url, string postString)
{
    using (WebClient client = new WebClient { Encoding = System.Text.Encoding.UTF8 })
    {
        client.Headers.Set(HttpRequestHeader.ContentType, "application/json");
        Console.WriteLine(client.UploadString(url, postString));
    }
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值