C# 网页信息采集

using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Net;
using System.IO;
using System.Data.SqlClient;

namespace WebBee
{
    #region CtripInfo
    /// <summary>
    /// Ctrip hotel information class.
    /// </summary>
    public class CtripInfo
    {

        #region Property
        private int mid;
        /// <summary>
        /// PK
        /// </summary>
        public int ID
        {
            get { return mid; }
            set { mid = value; }
        }
        private string mCity;

        public string City
        {
            get { return mCity; }
            set { mCity = value; }
        }

        private int mStarLevel;

        public int StarLevel
        {
            get { return mStarLevel; }
            set { mStarLevel = value; }
        }

        private string mtitle;

        public string Title
        {
            get { return mtitle; }
            set { mtitle = value; }
        }
        private string mNearObviousPlace;

        public string NearObviousPlace
        {
            get { return mNearObviousPlace; }
            set { mNearObviousPlace = value; }
        }

        private string mRemark;
        /// <summary>
        ///
简述

        /// </summary>
        public string Remark
        {
            get { return mRemark; }
            set { mRemark = value; }
        }
        private string mAddress;
        /// <summary>
        ///
地址
        /// </summary>
        public string Address
        {
            get { return mAddress; }
            set { mAddress = value; }
        }
        private string mPostcode;

        /// <summary>
        ///
邮编

        /// </summary>
        public string Postcode
        {
            get { return mPostcode; }
            set { mPostcode = value; }
        }
        private string mTel;
        /// <summary>
        ///
电话
        /// </summary>
        public string Tel
        {
            get { return mTel; }
            set { mTel = value; }
        }

        private string mNearAirdrome;
        /// <summary>
        ///
离机场距离

        /// </summary>
        public string NearAirdrome
        {
            get { return mNearAirdrome; }
            set { mNearAirdrome = value; }
        }
        private string mNearDepot;
        /// <summary>
        ///
离火车站公里数
        /// </summary>
        public string NearDepot
        {
            get { return mNearDepot; }
            set { mNearDepot = value; }
        }
        private string mNearCityCenter;
        /// <summary>
        ///
离市中心距离数
        /// </summary>
        public string NearCityCenter
        {
            get { return mNearCityCenter; }
            set { mNearCityCenter = value; }
        }
        private string mNearAsiaGame;
        /// <summary>
        ///
离亚运村距离数
        /// </summary>
        public string NearAsiaGame
        {
            get { return mNearAsiaGame; }
            set { mNearAsiaGame = value; }
        }
        private string mNearAround;
        /// <summary>
        ///
周围环境    
        /// </summary>
        public string NearAround
        {
            get { return mNearAround; }
            set { mNearAround = value; }
        }
        private string mAppendService;
        /// <summary>
        ///
附加服务

        /// </summary>
        public string AppendService
        {
            get { return mAppendService; }
            set { mAppendService = value; }
        }
        private string mServiceMode;
        /// <summary>
        ///
宾馆服务项目
        /// </summary>
        public string ServiceMode
        {
            get { return mServiceMode; }
            set { mServiceMode = value; }
        }
        private string mEstablishment;
        /// <summary>
        ///
宾馆餐饮设施
        /// </summary>
        public string Establishment
        {
            get { return mEstablishment; }
            set { mEstablishment = value; }
        }
        private string mPastime;
        /// <summary>
        ///
娱乐与健身设施
        /// </summary>
        public string Pastime
        {
            get { return mPastime; }
            set { mPastime = value; }
        }
        private string mBankCard;
        /// <summary>
        ///
可用信用卡类型
        /// </summary>
        public string BankCard
        {
            get { return mBankCard; }
            set { mBankCard = value; }
        }
        /// <summary>
        ///
离酒店ID
        /// </summary>
        private int mHotelID;

        public int HotelID
        {
            get { return mHotelID; }
            set { mHotelID = value; }
        }

 

 

 

 

        #endregion

        #region Method

        /// <summary>
        ///
获取标题
<div > <div >
        /// </summary>
        /// <param ></param>
        /// <returns></returns>
        public string GetTitle(string line)
        {
            return DropHTMLTag(line);
        }
        /// <summary>
        ///
获取简述
<div >  <div >
        /// </summary>
        /// <param ></param>
        /// <returns></returns>
        public string GetRemark(string line)
        {
            return DropHTMLTag(line.Replace("&nbsp;", ""));
        }

        /// <summary>
        ///
获取星级
<div >  <div >
        /// </summary>
        /// <param ></param>
        /// <returns></returns>
        public string GetStarLevel(string line)
        {
            int start = line.IndexOf("
星级:
") + 3;
            if (line.IndexOf("
星级:
") < 1)
            {
                return "0";
            }
            int end = line.IndexOf("</p>");
            if (end < 1)
            {
                return "0";
            }
            string tmp = line.Substring(start, end - start);
            int i = 0, count = 0;
            while (tmp.IndexOf("</span>", i) > 0)
            {
                i = tmp.IndexOf("</span>", i) + 7;
                count++;
            }
            return count.ToString();

        }
        /// <summary>
        ///
获取地址

        /// </summary>
        /// <param ></param>
        /// <returns></returns>
        public string GetAddress(string line)
        {

            int start = line.IndexOf("地址:") + 3;
            if (line.IndexOf("
地址:
") < 1)
            {
                return "
未知
";
            }

            int end = line.IndexOf("</p>", start);
            return DropHTMLTag(line.Substring(start, end - start));

        }

        /// <summary>
        ///
获取邮编

        /// </summary>
        /// <param ></param>
        /// <returns></returns>
        public string GetPostCode(string line)
        {
            int start = line.IndexOf("
邮编:") + 3;
            if (line.IndexOf("
邮编:
") < 1)
            {
                return "
未知
";
            }

            int end = line.IndexOf("</p>", start);
            return DropHTMLTag(line.Substring(start, end - start));
        }

        /// <summary>
        ///
获取电话

        /// </summary>
        /// <param ></param>
        /// <returns></returns>
        public string GetTel(string line)
        {
            int start = line.IndexOf("
电话:") + 3;
            if (line.IndexOf("
电话:
") < 1)
            {
                return "
未知
";
            }

            int end = line.IndexOf("</p>", start);
            return DropHTMLTag(line.Substring(start, end - start));

        }

        /// <summary>
        ///
获取离机场距离(公里)

        /// </summary>
        /// <param ></param>
        /// <returns></returns>
        public string GetNearAirdrome(string line)
        {
            int start = line.IndexOf("
离机场距离(公里):") + 10;
            if (line.IndexOf("
离机场距离(公里):
") < 1)
            {
                return "
未知
";
            }

            int end = line.IndexOf("</p>", start);
            return DropHTMLTag(line.Substring(start, end - start));

        }

        /// <summary>
        ///
获取离火车站距离(公里)

        /// </summary>
        /// <param ></param>
        /// <returns></returns>
        public string GetNearDepot(string line)
        {
            int start = line.IndexOf("
火车站距离(公里):") + 10;
            if (line.IndexOf("
火车站距离(公里):
") < 1)
            {
                return "
未知
";
            }

            int end = line.IndexOf("</p>", start);
            return DropHTMLTag(line.Substring(start, end - start));

        }
        /// <summary>
        ///
获取离市中心距离(公里):

        /// </summary>
        /// <param ></param>
        /// <returns></returns>
        public string GetCitycenter(string line)
        {

            int start = line.IndexOf("离市中心距离(公里):") + 11;
            if (line.IndexOf("
离市中心距离(公里):
") < 1)
            {
                return "
未知
";
            }

            int end = line.IndexOf("</p>", start);
            return DropHTMLTag(line.Substring(start, end - start));

        }

        /// <summary>
        ///
获取周边明显建筑

        /// </summary>
        /// <param ></param>
        /// <returns></returns>
        public string GetNearObviousPlace(string line)
        {
            try
            {
                int start = line.IndexOf("
离市中心距离(公里):") + 11;
                if (line.IndexOf("
离市中心距离(公里):
") < 1)
                    return "
未知
";
                if (line.IndexOf("
周围环境:
") < 1)
                    return "
未知
";
                int end = line.IndexOf("
周围环境:", start);

                string tmp = line.Substring(start, end - start);
                start = tmp.IndexOf("<p>");
                end = tmp.IndexOf("</p>", start);
                if (end < start)
                    return "
未知
";
                return DropHTMLTag(tmp.Substring(start, end - start));
            }
            catch
            {
                return "
未知
";
            }

        }

 

        /// <summary>
        ///
获取周围环境

        /// </summary>
        /// <param ></param>
        /// <returns></returns>
        public string GetNearAround(string line)
        {

            int start = line.IndexOf("周围环境:") + 5;
            if (line.IndexOf("
周围环境:
") < 1)
            {
                return "
未知
";
            }

            int end = line.IndexOf("</p>", start);
            return DropHTMLTag(line.Substring(start, end - start));

        }

        /// <summary>
        ///
获取附加

        /// </summary>
        /// <param ></param>
        /// <returns></returns>
        public string GetAppendService(string line)
        {

            int start = line.IndexOf("附加</span>选择</p>") + 15;
            if (line.IndexOf("
附加</span>选择
</p>") < 1)
            {
                return "
未知
";
            }

            int end = line.IndexOf("<p", start);

            return DropHTMLTag(line.Substring(start, end - start));

        }
        /// <summary>
        ///
获得宾馆服务项目

        /// </summary>
        /// <param ></param>
        /// <returns></returns>
        public string GetServiceMode(string line)
        {
            int start = line.IndexOf("
宾馆服务</span>项目") + 13;

            if (line.IndexOf("宾馆服务</span>项目") < 1)
            {
                return "
未知
";
            }

            int end = line.IndexOf("</div>", start);

            return DropHTMLTag(line.Substring(start, end - start)).Replace("&nbsp;", " ");
        }
        /// <summary>
        ///
获得宾馆娱乐与健身

        /// </summary>
        /// <param ></param>
        /// <returns></returns>
        public string GetPastime(string line)
        {

            int start = line.IndexOf("宾馆娱乐与健身</span>设施") + 16;

            if (line.IndexOf("宾馆娱乐与健身</span>设施") < 1)
            {
                return "
未知
";
            }

            int end = line.IndexOf("</div>", start);

            return DropHTMLTag(line.Substring(start, end - start)).Replace("&nbsp;", " ");

        }
        /// <summary>
        ///
获得宾馆餐饮设施

        /// </summary>
        /// <param ></param>
        /// <returns></returns>
        public string GetEstablishment(string line)
        {
            int start = line.IndexOf("
宾馆餐饮</span>设施</p>") + 17;

            if (line.IndexOf("宾馆餐饮</span>设施</p>") < 1)
            {
                return "
未知
";
            }

            int end = line.IndexOf("</div>", start);

            return DropHTMLTag(line.Substring(start, end - start)).Replace("&nbsp;", " ");

        }
        /// <summary>
        ///
获得可接受信用卡类型

        /// </summary>
        /// <param ></param>
        /// <returns></returns>
        public string GetCard(string line)
        {

            int start = line.IndexOf("信用卡</span>类型") + 12;

            if (line.IndexOf("信用卡</span>类型") < 1)
            {
                return "
未知
";
            }

            int end = line.IndexOf("</div>", start);

            string tmp = line.Substring(start, end - start);
            int i = 0;
            string s = "";
            while (tmp.IndexOf("alt=", i) > 0)
            {
                i = tmp.IndexOf("alt=", i) + 4;
                end = tmp.IndexOf(" src=", i) == -1 ? tmp.Length : tmp.IndexOf(" src=", i);
                s += tmp.Substring(i, end - i);
            }
            if (i == 0 || string.Empty == s.Trim())
                return "
未知
";
            return s.Replace("/"", "").Replace("&nbsp;", "").Replace("<img", " ").Replace(">", "");

        }
        /// <summary>
        ///
酒店价钱情况

        /// </summary>
        /// <param ></param>
        /// <returns></returns>
        public List<string[]> AddHotelPrice(string line)
        {
            line = line.Replace("/r/n", "").Replace("&nbsp;", "").Replace("/t", "").ToLower();
            List<string[]> table = new List<string[]>();
            int start = line.IndexOf("<thead>");
            int end = line.IndexOf("</thead>", start);
            if (end < -1)
                return null;
            string colum = line.Substring(start, end - start).Replace("</th>", "
");
            colum = colum.Remove(colum.LastIndexOf("
"));
            colum = DropHTMLTag(colum).Replace(" ", "");
            string[] title = colum.Split(new char[] { '
' });

            table.Add(title);

            start = line.IndexOf("<tbody>", end);
            end = line.IndexOf("</tbody>", start);
            string tbody = line.Substring(start, end - start);

            int i = 0;
            while (tbody.IndexOf("<tr>", i) > 0)
            {
                i = tbody.IndexOf("<tr>", i) + 4;
                end = tbody.IndexOf("<tr>", i) == -1 ? tbody.Length : tbody.IndexOf("<tr>", i);
                colum = tbody.Substring(i, end - i);
                colum = colum.Replace("</td>", "
");
                colum = ReplaceSingleQuotes(DropHTMLTag(colum).Replace(" ", ""));
                title = colum.Split(new char[] { '
' });
                table.Add(title);
            }
            //i = tbody.IndexOf("<tr>", i) + 4;
            //end = tbody.IndexOf("<tr>", i) == -1 ? tbody.Length : tbody.IndexOf("<tr>", i);
            //colum = tbody.Substring(i, end - i);
            //colum = colum.Replace("</td>", "
");
            //colum = ReplaceSingleQuotes(DropHTMLTag(colum).Replace(" ", "").Replace("&nbsp", ""));
            //title = colum.Split(new char[] { '
' });
            //table.Add(title);
            return table;
        }

        private List<string[]> mHotelPrice;
        /// <summary>
        ///
酒店价钱信息

        /// </summary>
        public List<string[]> HotelPrice
        {
            get { return mHotelPrice; }
            set { mHotelPrice = value; }
        }

 

        /// <summary>
        ///
HotelID返回实例

        /// </summary>
        /// <param ></param>
        /// <returns></returns>
        public CtripInfo GetCtripInfoByHotelID(int hid)
        {
            CtripInfo ci = new CtripInfo();
            string html = GetWebContent("
http://www.ctrip.com/Supermarket/Hotel/Hotel.asp?Hotel=" + hid.ToString(), Encoding.Default);
            if (null != html && string.Empty != html && html.IndexOf("
非常抱歉") < 1)
            {
                ci.HotelID = hid;
                int start = html.IndexOf("<div pubHotels_conTitle01_title/">");
                int end = html.IndexOf("</div>", start);
                ci.Title = GetTitle(html.Substring(start, end - start)).Replace("/'", "`");

                start = html.IndexOf("<div pubHotels_indStyle01/">");
                if (start < 1)
                {
                    ci.Remark = "
没有简述
";
                }
                else
                {
                    end = html.IndexOf("</div>", start);
                    ci.Remark = GetRemark(html.Substring(start, end - start));
                    ci.Remark = ReplaceSingleQuotes(ci.Remark);
                }


                start = html.IndexOf("<div priHtlChoice_hafLeft/">");

                end = html.IndexOf("</div>", start);
                ci.Address = ReplaceSingleQuotes(GetAddress(html.Substring(start, end - start)));
                if (ci.Address != "
未知
")
                {
                    ci.City = ci.Address.Substring(0, ci.Address.IndexOf(" "));
                }
                else
                {
                    ci.City = "
未知地区
";
                }
                ci.StarLevel = int.Parse(GetStarLevel(html.Substring(start, end - start)));
                ci.Postcode = GetPostCode(html.Substring(start, end - start));
                ci.Tel = GetTel(html.Substring(start, end - start));

                start = html.IndexOf("<div priHtlChoice_hafRight/">");
                end = html.IndexOf("</div>", start);
                ci.NearAirdrome = ReplaceSingleQuotes(GetNearAirdrome(html.Substring(start, end - start)));
                ci.NearAround = ReplaceSingleQuotes(GetNearAround(html.Substring(start, end - start)));
                ci.NearDepot = ReplaceSingleQuotes(GetNearDepot(html.Substring(start, end - start)));
                ci.NearObviousPlace = ReplaceSingleQuotes(GetNearObviousPlace(html.Substring(start, end - start)));
                ci.NearCityCenter = ReplaceSingleQuotes(GetCitycenter(html.Substring(start, end - start)));

                if (html.IndexOf("附加</span>选择</p>") > 0)
                {
                    start = html.IndexOf("
附加</span>选择
</p>") - 15;
                    end = html.IndexOf("<p", start) + 2;
                    ci.AppendService = ReplaceSingleQuotes(GetAppendService(html.Substring(start, end - start)));
                }
                else
                {
                    ci.AppendService = "
未知
";
                }

                if (html.IndexOf("宾馆服务</span>项目") > 0)
                {
                    start = html.IndexOf("
宾馆服务</span>项目
") - 13;
                    end = html.IndexOf("<p", start) + 2;
                    ci.ServiceMode = ReplaceSingleQuotes(GetServiceMode(html.Substring(start, end - start)));
                }
                else
                {
                    ci.ServiceMode = "
未知
";
                }

                if (html.IndexOf("宾馆餐饮</span>设施</p>") > 0)
                {
                    start = html.IndexOf("
宾馆餐饮</span>设施
</p>") - 17;
                    end = html.IndexOf("<p", start) + 2;
                    ci.Establishment = ReplaceSingleQuotes(GetEstablishment(html.Substring(start, end - start)));
                }
                else
                {
                    ci.Establishment = "
未知
";
                }

                if (html.IndexOf("宾馆娱乐与健身</span>设施") > 0)
                {
                    start = html.IndexOf("
宾馆娱乐与健身</span>设施
") - 16;
                    end = html.IndexOf("<p", start) + 2;
                    ci.Pastime = ReplaceSingleQuotes(GetPastime(html.Substring(start, end - start)));
                }
                else
                {
                    ci.Pastime = "
未知
";
                }
                if (html.IndexOf("
信用卡</span>类型
") > 0)
                {
                    start = html.IndexOf("
信用卡</span>类型
") - 12;
                    end = html.IndexOf("</div>", start) + 6;
                    ci.BankCard = ReplaceSingleQuotes(GetCard(html.Substring(start, end - start)));
                }
                else
                {
                    ci.BankCard = "
未知
";
                }
                if (html.IndexOf("
酒店房型、房价
</p>") > 0)
                {
                    start = html.IndexOf("
酒店房型、房价
</p>");
                    end = html.IndexOf("</table>", start);
                    ci.HotelPrice = AddHotelPrice(html.Substring(start, end - start));
                }
                else
                {
                    ci.HotelPrice = null;
                }


                return ci;
            }
            return null;


        }
        /// <summary>
        ///
删除HTML标识
        /// </summary>
        /// <param ></param>
        /// <returns></returns>
        public string DropHTMLTag(string htmlString)
        {
            htmlString = Regex.Replace(htmlString, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
            htmlString = Regex.Replace(htmlString, @"([/r/n])[/s]+", "", RegexOptions.IgnoreCase);
            htmlString = Regex.Replace(htmlString, @"-->", "", RegexOptions.IgnoreCase);
            htmlString = Regex.Replace(htmlString, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
            htmlString = Regex.Replace(htmlString, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
            htmlString = Regex.Replace(htmlString, @"<!--.*", "", RegexOptions.IgnoreCase);
            //htmlString = Regex.Replace(htmlString, @"&(quot|#34);", "/"", RegexOptions.IgnoreCase);
            //htmlString = Regex.Replace(htmlString, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
            //htmlString = Regex.Replace(htmlString, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
            //htmlString = Regex.Replace(htmlString, @"&(iexcl|#161);", "/xa1", RegexOptions.IgnoreCase);
            //htmlString = Regex.Replace(htmlString, @"&(cent|#162);", "/xa2", RegexOptions.IgnoreCase);
            //htmlString = Regex.Replace(htmlString, @"&(pound|#163);", "/xa3", RegexOptions.IgnoreCase);
            //htmlString = Regex.Replace(htmlString, @"&(copy|#169);", "/xa9", RegexOptions.IgnoreCase);
            //htmlString = Regex.Replace(htmlString, @"&#(/d+);", "", RegexOptions.IgnoreCase);

            //htmlString.Replace("<", "");
            //htmlString.Replace(">", "");
            //htmlString.Replace("/r/n", "");


            return htmlString;
        }
        /// <summary>
        ///
替换单引号成中文的单引号
        /// </summary>
        /// <param ></param>
        /// <returns></returns>
        public string ReplaceSingleQuotes(string hTML)
        {
            return hTML.Replace("/'", "`");
        }
        /// <summary>
        ///
根据Url地址得到网页的html源码
        /// </summary>
        /// <param ></param>
        /// <param ></param>
        /// <returns></returns>
        public string GetWebContent(string Url, Encoding encoding)
        {
            string strResult = "";
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
                //
声明一个HttpWebRequest请求
                request.Timeout = 30000;
                //
设置连接超时时间
                request.Headers.Set("Pragma", "no-cache");
                // request.Headers.Set("KeepAlive", "true");
                request.CookieContainer = new CookieContainer();
                request.Credentials = CredentialCache.DefaultCredentials;
                request.Referer = Url;

                request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727)";

                HttpWebResponse response = (HttpWebResponse)request.GetResponse();
                Stream streamReceive = response.GetResponseStream();

                StreamReader streamReader = new StreamReader(streamReceive, encoding);
                strResult = streamReader.ReadToEnd();
                streamReceive.Close();
                streamReader.Close();
                streamReceive = null;
                streamReader = null;
            }
            catch
            {
                return "";
            }
            return strResult;
        }
        /// <summary>
        ///
增加酒店

        /// </summary>
        /// <param ></param>
        /// <returns>bool</returns>
        public int AddHotel(CtripInfo ci)
        {
            string sql = @"Begin  INSERT INTO Ctrip(HotelID,title,StarLevel,City,remark, Address, Postcode, Tel, NearDepot, NearAirdrome, NearCityCenter,NearObviousPlace,NearAround, AppendService, Pastime, Establishment, ServiceMode, BankCard) VALUES ({0}, '{1}', {2}, '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}', '{10}', '{11}', '{12}', '{13}','{14}','{15}','{16}','{17}');";

            sql = string.Format(sql, ci.HotelID, ci.Title, ci.StarLevel, ci.City, ci.Remark, ci.Address, ci.Postcode, ci.Tel, ci.NearDepot, ci.NearAirdrome, ci.NearCityCenter, ci.NearObviousPlace, ci.NearAround, ci.AppendService, ci.Pastime, ci.Establishment, ci.ServiceMode, ci.BankCard);
            sql = sql + @" SELECT @@IDENTITY AS 'Identity'; end";
            SqlDataReader sr = DBHelper.ExecuteReader(sql);
            int tmp = 0;
            if (sr.Read())
            {
                tmp = int.Parse(sr.GetValue(0).ToString());
            }
            sr = null;
            return tmp;


        }

        public int LastID()
        {

            return DBHelper.GetMaxID("id", "Ctrip");
        }
        public bool AddHotelPrice(string[] s, int id)
        {
            string sql = "INSERT INTO HotelPrice(cid,RoomType, RoomOldPrice, RoomNewPrice, BroadBand, BedType) VALUES ({0},'{1}','{2}','{3}','{4}','{5}')";
            sql = string.Format(sql, id, s[1], s[2], s[3], s[4], s[5]);
            return DBHelper.ExecuteSql(sql) > 0;
        }
        #endregion

    }
    #endregion

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值