最近在学习搜索引擎,记录一下相关信息
嵌入式数据库
- BDB
- Perst
信息过滤
- Aho-Corasick
抓取网页的方法
- 使用TcpClient
/// <summary>
/// Fetches a page from www.sina.com.cn over a raw TCP connection by sending
/// a hand-written HTTP/1.1 GET request, then prints the whole raw response
/// (status line, headers and body) to the console.
/// Any exception is caught and only its message is printed.
/// </summary>
private static void TCPClientMethod()
{
    string hostName = "www.sina.com.cn";
    // NOTE(review): 8080 is not the standard HTTP port (80); kept as in the
    // original notes - confirm this is the intended port.
    int PortNumber = 8080;
    TcpClient client = new TcpClient();
    try
    {
        client.Connect(hostName, PortNumber);
        Console.Write("链接上了");

        // The request line must be "METHOD SP target SP version"; without the
        // spaces the server cannot parse it.
        string request = "GET / HTTP/1.1\r\n"
            + "User-Agent:craler request!\r\n"
            + "Host:" + hostName + "\r\n"
            + "Connection:Close\r\n"
            + "\r\n";
        byte[] requestBytes = Encoding.ASCII.GetBytes(request);

        using (NetworkStream clientStream = client.GetStream())
        {
            clientStream.Write(requestBytes, 0, requestBytes.Length);
            clientStream.Flush();

            // Decode through a StreamReader instead of decoding each raw
            // 1024-byte chunk: only the bytes actually received are decoded,
            // and multi-byte UTF-8 sequences split across reads stay intact.
            // The read ends when the server closes the connection, which the
            // "Connection:Close" header asks it to do.
            using (StreamReader reader = new StreamReader(clientStream, Encoding.UTF8))
            {
                Console.WriteLine(reader.ReadToEnd());
            }
        }
    }
    catch (Exception ex)
    {
        // Best-effort demo: report the failure and carry on.
        Console.WriteLine(ex.Message);
    }
    finally
    {
        client.Close();
    }
}
- 使用WebRequest
/// <summary>
/// Downloads the given URL with <see cref="HttpWebRequest"/>, reads the whole
/// response body as UTF-8 text, and prints every response header to the console.
/// </summary>
/// <param name="urlDownLoad">The HTTP URL to request.</param>
private static void WebRequestMethod(string urlDownLoad)
{
    // Build the request object representing an HTTP request to the URL.
    HttpWebRequest request = (HttpWebRequest)System.Net.WebRequest.Create(urlDownLoad);

    // The using blocks guarantee the response and the reader are released
    // even if reading the body or enumerating the headers throws.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (TextReader tr = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("utf-8")))
    {
        // Read the body from start to end; the text is not used further here.
        string htmlcontent = tr.ReadToEnd();

        // Print the response headers as "Header <name> : <value>".
        WebHeaderCollection whc = response.Headers;
        for (int i = 0; i < whc.Count; i++)
        {
            Console.WriteLine("Header " + whc.GetKey(i) + " : " + whc[i]);
        }
    }
}
- 使用WebClient
/// <summary>
/// Downloads the given URL with <see cref="WebClient"/> and prints the
/// response text to the console.
/// </summary>
/// <param name="url">The address to download.</param>
private static void WebClientMethod(string url)
{
    // using blocks release the client, stream and reader even when the
    // download or the read throws.
    using (WebClient webclient = new WebClient())
    using (Stream stream = webclient.OpenRead(url))
    using (StreamReader reader = new StreamReader(stream))
    {
        string strResult = reader.ReadToEnd();
        Console.WriteLine(strResult);
    }
}
提交数据的方法
- GET
/// <summary>
/// Shows how to configure an HTTP GET request with a custom User-Agent header.
/// The request is only built and configured here; it is never sent.
/// </summary>
/// <param name="url">The HTTP URL the request targets.</param>
private static void GETMethod(string url)
{
    string DefaultUserAgent = "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.2;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727)";

    // Direct cast instead of "as": a non-HTTP URL now fails right here with
    // an InvalidCastException rather than a NullReferenceException on the
    // next line.
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.Method = "GET";

    // Set the header information.
    request.UserAgent = DefaultUserAgent;
}
- POST
/// <summary>
/// Shows how to configure an HTTP POST request with a form-urlencoded
/// content type. The request is only built and configured here; no body is
/// written and the request is never sent.
/// </summary>
/// <param name="url">The HTTP URL the request targets.</param>
private static void POSTMethod(string url)
{
    // Direct cast instead of "as": a non-HTTP URL now fails right here with
    // an InvalidCastException rather than a NullReferenceException on the
    // next line.
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.Method = "POST";

    // Set the header information.
    request.ContentType = "application/x-www-form-urlencoded";
}
- HEAD 获取url页面的长度
/// <summary>
/// Issues an HTTP HEAD request and returns the content length reported by
/// the server for the page at <paramref name="url"/>.
/// </summary>
/// <param name="url">The absolute URL to probe.</param>
/// <returns>
/// The response's <c>ContentLength</c> when the status is 200 OK; 0 when the
/// status is anything else or when any exception occurs (invalid URL,
/// timeout, network failure, ...).
/// NOTE(review): ContentLength may itself be -1 when the server sends no
/// Content-Length header - confirm callers handle that.
/// </returns>
private static long HEADGetWebLength(string url)
{
    try
    {
        HttpWebRequest req = (HttpWebRequest)WebRequest.CreateDefault(new Uri(url));
        req.Method = "HEAD";
        req.Timeout = 5000;

        // using closes the response even if reading its properties throws.
        using (HttpWebResponse res = (HttpWebResponse)req.GetResponse())
        {
            if (res.StatusCode == HttpStatusCode.OK)
            {
                return res.ContentLength;
            }
            return 0;
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort: any failure is reported as length 0.
        return 0;
    }
}
- WebClient Post数据方法 非常简单
/// <summary>
/// Uploads <paramref name="postString"/> to <paramref name="url"/> through
/// <see cref="WebClient"/> with a JSON content type and UTF-8 encoding,
/// then writes the server's reply to the console.
/// </summary>
/// <param name="url">The address that receives the data.</param>
/// <param name="postString">The payload string to upload.</param>
public static void Post(string url, string postString)
{
    using (WebClient client = new WebClient { Encoding = System.Text.Encoding.UTF8 })
    {
        // Declare the payload as JSON before uploading.
        client.Headers[HttpRequestHeader.ContentType] = "application/json";

        // Upload the payload and echo the reply.
        Console.WriteLine(client.UploadString(url, postString));
    }
}