using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Net;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;
namespace TemCrawlApp
{
public partial class _Default : System.Web.UI.Page
{
/// <summary>
/// Handler with the conventional page-load signature (presumably wired to
/// the page's Load event; the wiring is not visible in this file).
/// The body is empty, so nothing happens here.
/// </summary>
/// <param name="sender">Event source; unused.</param>
/// <param name="e">Event data; unused.</param>
protected void Page_Load(object sender, EventArgs e)
{
}
/// <summary>
/// Click handler: downloads list pages 1 through 27 of the top.chinaz.com
/// ranking (query parameter t=247), collects the entries parsed from every
/// page that returned content, and hands the combined list to
/// <see cref="ExportCSV"/>.
/// </summary>
/// <param name="sender">Event source; unused.</param>
/// <param name="e">Event data; unused.</param>
protected void Button1_Click(object sender, EventArgs e)
{
    // The list pages are decoded as UTF-8.
    Encoding pageEncoding = Encoding.GetEncoding("utf-8");
    List<itemBase> collected = new List<itemBase>();

    int page = 1;
    while (page < 28)
    {
        // t=247 is the "portal sites" category according to the original note.
        string pageUrl = string.Format("http://top.chinaz.com/list.aspx?p={0}&t=247", page);
        string html = GetContent(pageUrl, pageEncoding);
        page++;

        // GetContent yields an empty string when the request fails;
        // such pages contribute nothing.
        if (html != "")
        {
            collected.AddRange(GetList(html));
        }
    }

    ExportCSV(collected);
}
/// <summary>
/// Empty placeholder: accepts a list of scraped items and does nothing
/// with it. No caller is visible in this file.
/// </summary>
/// <param name="lists">Scraped items; currently ignored.</param>
void downLoad(List<itemBase> lists)
{
}
/// <summary>
/// Renders the scraped items as CSV text: a leading blank line, one header
/// row, then one row per item. Rows are terminated with "\n".
/// </summary>
/// <remarks>
/// Every field is escaped following RFC 4180: a value containing a comma,
/// a double quote or a line break is wrapped in double quotes and embedded
/// quotes are doubled. Without this, scraped text containing a comma would
/// shift all following columns of that row.
/// </remarks>
/// <param name="collections">Items to export; <c>null</c> yields the header only.</param>
/// <returns>The complete CSV document as a string.</returns>
private string GetCsvData(List<itemBase> collections)
{
    StringBuilder data = new StringBuilder();

    // The export has always started with an empty line; kept so the
    // produced file layout does not change.
    data.AppendLine();

    AppendCsvRow(data, new string[]
    {
        "网站名称", "网站地址", "建站时间", "网站所属", "所属地区",
        "创始人/团队", "网站类型", "Alexa排名", "网站简介"
    });

    if (collections == null)
    {
        return data.ToString();
    }

    foreach (itemBase item in collections)
    {
        if (item == null)
        {
            // A null entry carries no data; skip it instead of failing the export.
            continue;
        }

        // Column order must match the header row above.
        AppendCsvRow(data, new string[]
        {
            item.Name, item.Url, item.cdTime, item.classification, item.Area,
            item.Founder, item.Type, item.AlexaRank, item.Introduction
        });
    }

    return data.ToString();
}

// Characters that force a CSV field to be quoted.
private static readonly char[] CsvSpecialChars = { ',', '"', '\r', '\n' };

// Appends one comma-separated, "\n"-terminated row of escaped fields.
private static void AppendCsvRow(StringBuilder target, string[] fields)
{
    for (int i = 0; i < fields.Length; i++)
    {
        if (i > 0)
        {
            target.Append(',');
        }
        target.Append(EscapeCsvField(fields[i]));
    }
    target.Append('\n');
}

// Returns the field unchanged when it is plain text, an empty string for
// null, and a double-quoted value (inner quotes doubled) otherwise.
private static string EscapeCsvField(string field)
{
    if (string.IsNullOrEmpty(field))
    {
        return string.Empty;
    }
    if (field.IndexOfAny(CsvSpecialChars) < 0)
    {
        return field;
    }
    return "\"" + field.Replace("\"", "\"\"") + "\"";
}
/// <summary>
/// Sends the given items to the browser as a downloadable CSV file named
/// "ExportData.csv", encoded as gb2312, and ends the response.
/// </summary>
/// <param name="collections">Items to export; passed to <see cref="GetCsvData"/>.</param>
private void ExportCSV(List<itemBase> collections)
{
    string data = GetCsvData(collections);

    // Clear previously set headers BEFORE configuring the response, so the
    // reset cannot undo the charset / content type assigned below.
    Response.ClearHeaders();

    Response.Charset = "gb2312";
    Response.ContentEncoding = Encoding.GetEncoding("gb2312");

    // The payload is CSV, not an HTML page.
    Response.ContentType = "text/csv";
    Response.AppendHeader("Content-disposition", "attachment;filename=ExportData.csv");

    Response.Write(data);
    Response.End();
}
// Patterns for the ranking list page: one match per entry, and the link to
// that entry's detail page.
private static readonly Regex RegListEntry = new Regex(@"<li><figure>[\s\S]+?</li>");
private static readonly Regex RegDetailLink = new Regex(@"<h3><a.+?href=""(?<href>.+?)"".+?>.+?</a>");

// Patterns for a detail page: the main block and the individual fields in it.
// They are built once instead of once per downloaded detail page.
private static readonly Regex RegBlock = new Regex(@"<div class=""main"" role=""main"" >[\s\S]+?</script>");
private static readonly Regex RegName = new Regex(@"网站名称:.+?spanwillchuanwebName""></span>(?<name>.+?)<a");
private static readonly Regex RegUrl = new Regex(@"网站地址:.+?<a[^>]*>(?<url>.+?)</a>");
private static readonly Regex RegCdTime = new Regex(@"建站时间:</span>(?<time>.+?)</td>");
private static readonly Regex RegCf = new Regex(@"网站所属:</span>(?<cf>.+?)</td>");
private static readonly Regex RegArea = new Regex(@"所属地区:</span><a[^>]*>(?<area>.+?)</td>");
private static readonly Regex RegFounder = new Regex(@"创始人/团队:</span>(?<founder>.+?)</td>");
private static readonly Regex RegType = new Regex(@"网站类型:</span><a[^>]*>(?<type>.+?)</td>");
private static readonly Regex RegIn = new Regex(@"网站简介.*?<td[^>]*>(?<in>[\s\S]+?)</td>");
private static readonly Regex RegAlexa = new Regex(@"Alexa排名.*?</span>(?<alexa>.+?)</li>");

/// <summary>
/// Parses one ranking list page: finds the detail-page link of every list
/// entry, downloads each detail page (decoded as gb2312) and extracts one
/// <see cref="itemBase"/> per main block found on it.
/// </summary>
/// <remarks>
/// A list entry without a detail link is skipped on its own; it no longer
/// causes the whole page to be discarded. Detail pages that fail to
/// download (empty content) simply contribute no items.
/// </remarks>
/// <param name="content">HTML of a list page; null or empty yields an empty list.</param>
/// <returns>The items scraped from the linked detail pages; never null.</returns>
List<itemBase> GetList(string content)
{
    List<itemBase> lists = new List<itemBase>();
    if (string.IsNullOrEmpty(content))
    {
        return lists;
    }

    // Step 1: collect the detail-page URLs.
    List<string> detailUrls = new List<string>();
    foreach (Match entry in RegListEntry.Matches(content))
    {
        Match link = RegDetailLink.Match(entry.Value);
        if (!link.Success)
        {
            continue;
        }
        // The href captured from the list page is site-relative.
        detailUrls.Add("http://top.chinaz.com" + link.Groups["href"].Value);
    }

    // Step 2: download and parse every detail page.
    Encoding detailEncoding = Encoding.GetEncoding("gb2312");
    foreach (string detailUrl in detailUrls)
    {
        string detailHtml = GetContent(detailUrl, detailEncoding);
        foreach (Match block in RegBlock.Matches(detailHtml))
        {
            lists.Add(ParseDetailBlock(block.Value));
        }
    }

    return lists;
}

// Extracts all fields of one detail-page block. A field whose pattern does
// not match is left as an empty string.
private static itemBase ParseDetailBlock(string block)
{
    itemBase it = new itemBase();

    // Site name
    it.Name = RegName.Match(block).Groups["name"].Value.Replace(" ", "");
    // Site address
    it.Url = RegUrl.Match(block).Groups["url"].Value;
    // Site creation time
    it.cdTime = RegCdTime.Match(block).Groups["time"].Value;
    // Site owner / affiliation
    it.classification = RegCf.Match(block).Groups["cf"].Value;
    // Region (may contain several linked values)
    it.Area = StripAnchorTags(RegArea.Match(block).Groups["area"].Value);
    // Founder / team
    it.Founder = RegFounder.Match(block).Groups["founder"].Value;
    // Site type (may contain several linked values)
    it.Type = StripAnchorTags(RegType.Match(block).Groups["type"].Value);
    // Site introduction, with all whitespace removed.
    // NOTE(review): the Replace below has identical arguments as written, so
    // it changes nothing; presumably it was meant to swap ASCII commas for
    // full-width ones - confirm against the original source encoding.
    string introduction = RegIn.Match(block).Groups["in"].Value.Replace(",", ",").Trim();
    it.Introduction = Regex.Replace(introduction, @"\s+", "");
    // Alexa rank
    it.AlexaRank = RegAlexa.Match(block).Groups["alexa"].Value;

    return it;
}

// Removes opening and closing anchor tags, keeping the link texts.
// NOTE(review): the comma Replace has identical arguments as written and is
// therefore a no-op; kept verbatim pending confirmation of the intent.
private static string StripAnchorTags(string html)
{
    string withoutClosers = html.Replace("</a>", "").Replace(",", ",");
    return Regex.Replace(withoutClosers, @"<a[^>]*>", "");
}
/// <summary>
/// One scraped website record. Each member corresponds to one column of the
/// CSV export; all members are plain strings and default to null.
/// </summary>
class itemBase
{
    /// <summary>Site name (CSV column "网站名称").</summary>
    public string Name { get; set; }

    /// <summary>Site address (CSV column "网站地址").</summary>
    public string Url { get; set; }

    /// <summary>Site creation time (CSV column "建站时间").</summary>
    public string cdTime { get; set; }

    /// <summary>Owner / affiliation of the site (CSV column "网站所属").</summary>
    public string classification { get; set; }

    /// <summary>Region of the site (CSV column "所属地区").</summary>
    public string Area { get; set; }

    /// <summary>Founder or team (CSV column "创始人/团队").</summary>
    public string Founder { get; set; }

    /// <summary>Site type (CSV column "网站类型").</summary>
    public string Type { get; set; }

    /// <summary>Site introduction (CSV column "网站简介").</summary>
    public string Introduction { get; set; }

    /// <summary>Alexa rank (CSV column "Alexa排名").</summary>
    public string AlexaRank { get; set; }
}
/// <summary>
/// Downloads <paramref name="URL"/> with an HTTP GET that imitates a
/// Firefox browser and returns the response body decoded with
/// <paramref name="encodingFormat"/>.
/// </summary>
/// <remarks>
/// Redirects are not followed (AllowAutoRedirect is false). The response,
/// its stream and the reader are disposed before returning, so the
/// connection is released even when reading fails. Any failure (invalid
/// URL, network error, error status, null encoding) is reported through
/// the console and turned into an empty string; no exception escapes.
/// </remarks>
/// <param name="URL">Absolute http/https address to fetch.</param>
/// <param name="encodingFormat">Encoding used to decode the response body.</param>
/// <returns>The page text, or "" when the request failed.</returns>
private string GetContent(string URL, Encoding encodingFormat)
{
    try
    {
        // Create the web request.
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(URL);
        request.Method = "GET";
        request.UserAgent = "Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1";
        request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
        request.Headers.Add("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
        // Simulated browser header.
        request.ContentType = "application/x-www-form-urlencoded";
        request.AllowAutoRedirect = false;
        // Cookies are handled by a fresh, per-request container.
        request.CookieContainer = new CookieContainer();
        request.KeepAlive = true;

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, encodingFormat))
        {
            return reader.ReadToEnd();
        }
    }
    catch (Exception ex) // GET failed
    {
        // Best effort by design: callers treat "" as "page unavailable".
        Console.WriteLine("远程服务器返回错误" + URL + " (" + ex.Message + ")");
        return "";
    }
}
}
}
// 抓取网页中需要的信息,并导出到Excel中
// 最新推荐文章于 2024-07-11 13:53:16 发布