抓取网页中需要的信息,并导出为 CSV 文件(可用 Excel 打开)

using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Net;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;

namespace TemCrawlApp
{
    /// <summary>
    /// Web Forms page that crawls the ranking list pages of top.chinaz.com,
    /// follows each entry to its detail page, extracts the site attributes with
    /// regular expressions and streams the result to the browser as a CSV
    /// attachment.
    /// </summary>
    public partial class _Default : System.Web.UI.Page
    {
        // Root used to turn the relative detail links of the list page into absolute URLs.
        private const string SiteRoot = "http://top.chinaz.com";

        // List page URL; {0} is the page number. The original author's comment marks
        // t=247 as the "portal websites" category.
        private const string ListUrlFormat = "http://top.chinaz.com/list.aspx?p={0}&t=247";

        // Inclusive range of list pages to crawl (the original loop ran 1..27).
        private const int FirstListPage = 1;
        private const int LastListPage = 27;

        // Characters that force a CSV field to be quoted.
        private static readonly char[] CsvSpecialChars = new[] { ',', '"', '\r', '\n' };

        // Header row of the exported CSV, in the same order as the data columns.
        private static readonly string[] CsvHeaders = new[]
        {
            "网站名称", "网站地址", "建站时间", "网站所属", "所属地区",
            "创始人/团队", "网站类型", "Alexa排名", "网站简介"
        };

        // The patterns below are built once instead of once per downloaded page.
        private static readonly Regex ListItemRegex = new Regex(@"<li><figure>[\s\S]+?</li>");
        private static readonly Regex DetailLinkRegex = new Regex(@"<h3><a.+?href=""(?<href>.+?)"".+?>.+?</a>");
        private static readonly Regex DetailBlockRegex = new Regex(@"<div class=""main"" role=""main"" >[\s\S]+?</script>");
        private static readonly Regex NameRegex = new Regex(@"网站名称:.+?spanwillchuanwebName""></span>(?<name>.+?)<a");
        private static readonly Regex UrlRegex = new Regex(@"网站地址:.+?<a[^>]*>(?<url>.+?)</a>");
        private static readonly Regex CreatedRegex = new Regex(@"建站时间:</span>(?<time>.+?)</td>");
        private static readonly Regex OwnerRegex = new Regex(@"网站所属:</span>(?<cf>.+?)</td>");
        private static readonly Regex AreaRegex = new Regex(@"所属地区:</span><a[^>]*>(?<area>.+?)</td>");
        private static readonly Regex FounderRegex = new Regex(@"创始人/团队:</span>(?<founder>.+?)</td>");
        private static readonly Regex TypeRegex = new Regex(@"网站类型:</span><a[^>]*>(?<type>.+?)</td>");
        private static readonly Regex IntroRegex = new Regex(@"网站简介.*?<td[^>]*>(?<in>[\s\S]+?)</td>");
        private static readonly Regex AlexaRegex = new Regex(@"Alexa排名.*?</span>(?<alexa>.+?)</li>");
        private static readonly Regex AnchorOpenTagRegex = new Regex(@"<a[^>]*>");
        private static readonly Regex WhitespaceRegex = new Regex(@"\s+");

        /// <summary>Page load handler; intentionally empty.</summary>
        protected void Page_Load(object sender, EventArgs e)
        {
        }

        /// <summary>
        /// Click handler: crawls every list page, collects the parsed sites and
        /// sends them to the client as a CSV download.
        /// </summary>
        protected void Button1_Click(object sender, EventArgs e)
        {
            Encoding listEncoding = Encoding.GetEncoding("utf-8");
            List<itemBase> sites = new List<itemBase>();

            for (int page = FirstListPage; page <= LastListPage; page++)
            {
                string url = string.Format(ListUrlFormat, page);
                string content = GetContent(url, listEncoding);
                if (string.IsNullOrEmpty(content))
                {
                    // Download failed (GetContent returns "" on error); skip this page.
                    continue;
                }

                sites.AddRange(GetList(content));
            }

            ExportCSV(sites);
        }

        /// <summary>Unused placeholder kept from the original code; does nothing.</summary>
        void downLoad(List<itemBase> lists)
        {

        }

        /// <summary>
        /// Builds the CSV text (header row followed by one row per item).
        /// </summary>
        /// <param name="collections">Items to export; may be null, in which case only the header is produced.</param>
        /// <returns>CSV text with rows terminated by "\n".</returns>
        private string GetCsvData(List<itemBase> collections)
        {
            StringBuilder data = new StringBuilder();

            // The header is now the first line; the original emitted an empty line before it.
            AppendCsvRow(data, CsvHeaders);

            if (collections != null)
            {
                foreach (itemBase item in collections)
                {
                    AppendCsvRow(data, new[]
                    {
                        item.Name,
                        item.Url,
                        item.cdTime,
                        item.classification,
                        item.Area,
                        item.Founder,
                        item.Type,
                        item.AlexaRank,
                        item.Introduction
                    });
                }
            }

            return data.ToString();
        }

        /// <summary>Appends one comma-separated, "\n"-terminated row to <paramref name="target"/>.</summary>
        private static void AppendCsvRow(StringBuilder target, string[] fields)
        {
            for (int i = 0; i < fields.Length; i++)
            {
                if (i > 0)
                {
                    target.Append(',');
                }

                target.Append(EscapeCsvField(fields[i]));
            }

            target.Append('\n');
        }

        /// <summary>
        /// Quotes a field when it contains a comma, double quote or line break,
        /// doubling embedded quotes, so scraped text cannot shift columns or rows.
        /// Null is written as an empty field.
        /// </summary>
        private static string EscapeCsvField(string field)
        {
            // NOTE(review): values come from scraped pages and the file is opened in a
            // spreadsheet; fields starting with '=', '+', '-' or '@' are not neutralised here.
            if (string.IsNullOrEmpty(field))
            {
                return string.Empty;
            }

            if (field.IndexOfAny(CsvSpecialChars) < 0)
            {
                return field;
            }

            return "\"" + field.Replace("\"", "\"\"") + "\"";
        }

        /// <summary>
        /// Writes the items to the HTTP response as the attachment "ExportData.csv"
        /// encoded in gb2312, then ends the response.
        /// </summary>
        private void ExportCSV(List<itemBase> collections)
        {
            string data = GetCsvData(collections);
            string disposition = string.Format("attachment;filename={0}", "ExportData.csv");

            // Reset the response first so the settings made below are not discarded
            // by the clearing calls (the original cleared headers after setting Charset).
            Response.Clear();
            Response.ClearHeaders();

            Response.Charset = "gb2312";
            Response.ContentEncoding = Encoding.GetEncoding("gb2312");
            Response.ContentType = "text/csv";
            Response.AppendHeader("Content-disposition", disposition);
            Response.Write(data);
            Response.End();
        }

        /// <summary>
        /// Parses one list page: finds the detail link of every list entry,
        /// downloads each detail page and extracts the site data from it.
        /// </summary>
        /// <param name="content">HTML of a list page.</param>
        /// <returns>The parsed sites; empty when nothing could be extracted.</returns>
        List<itemBase> GetList(string content)
        {
            List<itemBase> sites = new List<itemBase>();
            if (string.IsNullOrEmpty(content))
            {
                return sites;
            }

            // Detail pages are decoded as gb2312, as in the original code.
            Encoding detailEncoding = Encoding.GetEncoding("gb2312");

            foreach (Match listItem in ListItemRegex.Matches(content))
            {
                Match link = DetailLinkRegex.Match(listItem.Value);
                if (!link.Success)
                {
                    // Entry without a detail link: skip only this entry. The original
                    // discarded the whole page when the first entry had no link.
                    continue;
                }

                string detailUrl = SiteRoot + link.Groups["href"].Value;
                string detailHtml = GetContent(detailUrl, detailEncoding);
                if (string.IsNullOrEmpty(detailHtml))
                {
                    continue;
                }

                foreach (Match block in DetailBlockRegex.Matches(detailHtml))
                {
                    sites.Add(ParseSite(block.Value));
                }
            }

            return sites;
        }

        /// <summary>
        /// Extracts the site attributes from the main block of a detail page.
        /// Attributes whose pattern does not match are left as empty strings.
        /// </summary>
        private static itemBase ParseSite(string block)
        {
            itemBase site = new itemBase();

            // The original called Replace("", ""), which throws ArgumentException because
            // the search string is empty.
            // NOTE(review): presumably the intent was to strip a non-breaking space
            // (entity or character) after the name - confirm against a live page.
            site.Name = NameRegex.Match(block).Groups["name"].Value.Replace("&nbsp;", "").Trim();
            site.Url = UrlRegex.Match(block).Groups["url"].Value;
            site.cdTime = CreatedRegex.Match(block).Groups["time"].Value;
            site.classification = OwnerRegex.Match(block).Groups["cf"].Value;
            site.Area = StripAnchorTags(AreaRegex.Match(block).Groups["area"].Value);
            site.Founder = FounderRegex.Match(block).Groups["founder"].Value;
            site.Type = StripAnchorTags(TypeRegex.Match(block).Groups["type"].Value);

            // All whitespace is removed from the introduction so it stays on one line.
            // Commas no longer need special treatment here: the CSV writer quotes them.
            site.Introduction = WhitespaceRegex.Replace(IntroRegex.Match(block).Groups["in"].Value, "");
            site.AlexaRank = AlexaRegex.Match(block).Groups["alexa"].Value;

            return site;
        }

        /// <summary>Removes opening and closing anchor tags, keeping the link text.</summary>
        private static string StripAnchorTags(string html)
        {
            return AnchorOpenTagRegex.Replace(html.Replace("</a>", ""), "");
        }

        /// <summary>One scraped site; every value is the raw text captured from the detail page.</summary>
        class itemBase
        {
            public string Name { get; set; }
            public string Url { get; set; }
            public string cdTime { get; set; }
            public string classification { get; set; }
            public string Area { get; set; }
            public string Founder { get; set; }
            public string Type { get; set; }
            public string Introduction { get; set; }
            public string AlexaRank { get; set; }
        }

        /// <summary>
        /// Downloads a URL with a browser-like GET request (redirects are not followed).
        /// </summary>
        /// <param name="URL">Absolute URL to fetch.</param>
        /// <param name="encodingFormat">Encoding used to decode the response body.</param>
        /// <returns>The response body, or an empty string when the request fails for any reason.</returns>
        private string GetContent(string URL, Encoding encodingFormat)
        {
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(URL);
                request.Method = "GET";
                request.UserAgent = "Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1";
                request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
                request.Headers.Add("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
                request.ContentType = "application/x-www-form-urlencoded";
                request.AllowAutoRedirect = false;
                request.CookieContainer = new CookieContainer();
                request.KeepAlive = true;

                // Dispose the response and its stream so the underlying connection is
                // released; this method is called once per list page and per detail page.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream body = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(body, encodingFormat))
                {
                    return reader.ReadToEnd();
                }
            }
            catch (Exception ex)
            {
                // Best effort: a failed download is reported and treated as "no content".
                // Trace is used because console output is not visible in a web application.
                System.Diagnostics.Trace.WriteLine("远程服务器返回错误" + URL + ": " + ex.Message);
                return "";
            }
        }
    }
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值