8、多层次网页数据抓取

最新推荐文章于 2023-01-29 09:49:04 发布
william11zhu
最新推荐文章于 2023-01-29 09:49:04 发布
阅读量905
点赞数
分类专栏：学习笔记之C#
本文链接：https://blog.csdn.net/jj547139491/article/details/9668163
版权
学习笔记之C# 专栏收录该内容
9 篇文章 0 订阅
订阅专栏
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Threading;
using System.Data.OleDb;

namespace Institute
{
    public partial class Form1 : Form
    {
        /// <summary>
        /// Creates the form and runs <c>InitializeComponent</c>.
        /// </summary>
        public Form1()
        {
            InitializeComponent();
        }

        // Backing store for DtCondition; starts out as an empty table named "Condition".
        private DataTable dtCondition = new DataTable("Condition");

        /// <summary>
        /// Gets or sets the condition table that <c>ReadCondition</c> fills and the
        /// crawl methods read row by row.
        /// </summary>
        public DataTable DtCondition
        {
            get { return dtCondition; }
            set { dtCondition = value; }
        }

        /// <summary>
        /// Click handler of the start button: loads the condition table, crawls the site
        /// starting from its home page, then tells the user the URL collection is finished.
        /// </summary>
        private void btnStart_Click(object sender, EventArgs e)
        {
            const string homePageUrl = "http://itp.ne.jp";

            ReadCondition();
            CatchPage(homePageUrl);

            MessageBox.Show("URL获取完毕！");
        }

        /// <summary>
        /// Click handler of the "get data" button; delegates all work to <c>CatchArea</c>.
        /// </summary>
        private void btnGetData_Click(object sender, EventArgs e)
        {
            // Neither the sender nor the event arguments are needed here.
            this.CatchArea();
        }

        #region CatchPage
        /// <summary>
        /// Collects URLs: downloads <paramref name="requestUrl"/>, splits it into link
        /// fragments with <c>GenreSearch</c>, and for every row of <see cref="DtCondition"/>
        /// follows the first fragment whose label equals the row's column 0.
        /// </summary>
        /// <param name="requestUrl">URL of the page to download and parse.</param>
        public void CatchPage(string requestUrl)
        {
            const string siteRoot = "http://itp.ne.jp";
            // Column that holds the first sub-level condition of a row.
            const int firstChildColumn = 1;

            // Download the page and extract the candidate link fragments.
            string[] fragments = GenreSearch(webRequest(requestUrl));

            for ( int rowIndex = 0; rowIndex < DtCondition.Rows.Count; rowIndex++ )
            {
                // The last fragment is deliberately not examined (loop stops at Length - 1).
                for ( int index = 0; index < fragments.Length - 1; index++ )
                {
                    string fragment = fragments[index];

                    // Label text of the fragment with the markup remnants stripped.
                    string label = Regex.Match(fragment, ">.*?</h2>|>.*").Value;
                    label = label.Replace(">", "").Replace("</h2", "").Trim();

                    if ( DtCondition.Rows[rowIndex][0].ToString() != label )
                    {
                        continue;
                    }

                    // The first double-quoted value of the fragment is used as the link target.
                    MatchCollection quoted = Regex.Matches(fragment, "\".*?\"");
                    bool hasChildCondition = DtCondition.Rows[rowIndex][firstChildColumn].ToString() != "";
                    string target = siteRoot + quoted[0].Value;

                    if ( hasChildCondition )
                    {
                        // A sub-level condition exists: keep matching on the child page.
                        CatchChildPage(target, firstChildColumn, rowIndex);
                    }
                    else
                    {
                        // No sub-level condition: go straight to the list-writing overload.
                        CatchChildPage(target, "LastGenList.txt");
                    }

                    // Only the first matching fragment is followed for each row.
                    break;
                }
            }
        }
        #endregion

        #region CatchChildPage_1
        /// <summary>
        /// Used while the condition row still has a value for the current level: downloads
        /// <paramref name="requestUrl"/>, looks for the list entry whose name equals
        /// <c>DtCondition.Rows[i][key]</c> and follows it, either one level deeper (when the
        /// next column of the row is filled) or into the unconditional overload.
        /// </summary>
        /// <param name="requestUrl">URL to download; double quotes inside it are removed first.</param>
        /// <param name="key">Column index in <see cref="DtCondition"/> of the name to match on this page.</param>
        /// <param name="i">Row index in <see cref="DtCondition"/> of the condition being resolved.</param>
        public void CatchChildPage(string requestUrl, int key, int i)
        {
            // Levels at or beyond this column index are never matched (kept from the original check).
            const int maxLevels = 4;

            requestUrl = requestUrl.Replace("\"", "");
            string[] groupArray = GetHref(webRequest(requestUrl));

            // Element 0 is the text in front of the first "<li>", so scanning starts at 1.
            for ( int j = 1; j < groupArray.Length; j++ )
            {
                string strCondition = Regex.Match(groupArray[j], @">.*?<").Value;

                // The name is the text between the leading '>' and the first '('.
                // Entries without that shape cannot match and used to crash Substring.
                int parenIndex = strCondition.IndexOf('(');
                if ( parenIndex < 1 )
                {
                    continue;
                }
                strCondition = strCondition.Substring(1, parenIndex - 1).Trim();

                if ( !strCondition.Equals(DtCondition.Rows[i][key].ToString()) || key >= maxLevels )
                {
                    continue;
                }

                MatchCollection request = Regex.Matches(groupArray[j], "\".*?\"");

                // Look at the next column only if it exists; the last column has no
                // successor, and reading one past it threw IndexOutOfRangeException.
                int nextKey = key + 1;
                bool hasNextCondition = nextKey < DtCondition.Columns.Count
                    && DtCondition.Rows[i][nextKey].ToString() != "";

                if ( hasNextCondition )
                {
                    // Deeper condition present: recurse into this overload.
                    // NOTE(review): assumes the drill-down link is the second quoted value
                    // of the entry, as the code has always done — confirm against the site.
                    CatchChildPage("http://itp.ne.jp" + request[1].Value, nextKey, i);
                }
                else
                {
                    // No deeper condition: continue with the unconditional overload.
                    CatchChildPage("http://itp.ne.jp" + request[0].Value);
                }
                break;
            }
        }
        #endregion

        #region CatchChildPage_2
        /// <summary>
        /// Used when no further condition applies: downloads <paramref name="requestUrl"/>
        /// and walks every list entry, recursing into entries that carry more than one quoted
        /// value and handing single-link, non-empty entries to the list-writing overload.
        /// </summary>
        /// <param name="requestUrl">URL to download; double quotes inside it are removed first.</param>
        public void CatchChildPage(string requestUrl)
        {
            string pageUrl = requestUrl.Replace("\"", "");
            string[] entries = GetHref(webRequest(pageUrl));

            // Index 0 holds the text before the first "<li>" and is skipped.
            for ( int n = 1; n < entries.Length; n++ )
            {
                string entry = entries[n];
                MatchCollection quoted = Regex.Matches(entry, "\".*?\"");

                // More than one quoted value means there is a further level: descend into it.
                if ( quoted.Count > 1 )
                {
                    CatchChildPage("http://itp.ne.jp" + quoted[1].Value);
                    continue;
                }

                if ( quoted.Count != 1 )
                {
                    continue;
                }

                // Exactly one quoted value and no further level: follow it unless the
                // parenthesised counter of the entry reads "(0件)".
                string hitCount = Regex.Match(entry, @"\(.*?\)").Value;
                if ( hitCount != "(0件)" )
                {
                    CatchChildPage("http://itp.ne.jp" + quoted[0].Value, "LastGenList.txt");
                }
            }
        }
        #endregion


        #region CatchChildPage_3
        /// <summary>
        /// Collects the URLs of the regional (prefecture and below) levels: downloads
        /// <paramref name="requestUrl"/>, recurses into entries that carry more than one quoted
        /// value, and appends the URL of every single-link, non-empty entry to the file
        /// <paramref name="txt"/> in the application start-up folder.
        /// </summary>
        /// <param name="requestUrl">URL to download; double quotes inside it are removed first.</param>
        /// <param name="txt">Name of the text file the final URLs are appended to.</param>
        public void CatchChildPage(string requestUrl, string txt)
        {
            string pageUrl = requestUrl.Replace("\"", "");
            string[] entries = GetHref(webRequest(pageUrl));

            // Index 0 holds the text before the first "<li>" and is skipped.
            for ( int n = 1; n < entries.Length; n++ )
            {
                string entry = entries[n];
                MatchCollection quoted = Regex.Matches(entry, "\".*?\"");

                // More than one quoted value means there is a further level: descend into it.
                if ( quoted.Count > 1 )
                {
                    CatchChildPage("http://itp.ne.jp" + quoted[1].Value, txt);
                    continue;
                }

                if ( quoted.Count != 1 )
                {
                    continue;
                }

                string hitCount = Regex.Match(entry, @"\(.*?\)").Value;
                if ( hitCount == "(0件)" )
                {
                    continue;
                }

                // Final level reached: append the cleaned-up URL as one line.
                string line = "http://itp.ne.jp" + quoted[0].Value.Replace("\"", "").Replace("amp;", "");
                using ( StreamWriter output = new StreamWriter(Application.StartupPath + @"\" + txt, true) )
                {
                    output.WriteLine(line);
                }
            }
        }
        #endregion

        #region webRequest
        /// <summary>
        /// Downloads the page at <paramref name="requestUrl"/> with an HTTP GET through the
        /// hard-coded proxy and returns the body decoded as Shift_JIS.
        /// </summary>
        /// <param name="requestUrl">Absolute URL of the page to download.</param>
        /// <returns>The complete response body; never <c>null</c>.</returns>
        /// <remarks>
        /// Connection-level failures (<see cref="WebException"/>, <see cref="IOException"/>) are
        /// retried after a one-second pause until a response is read. The retry is a loop, not
        /// recursion, so a long outage no longer grows the call stack. Other exceptions (for
        /// example a malformed URL) are not retried because repeating the call cannot succeed.
        /// </remarks>
        public string webRequest(string requestUrl)
        {
            while ( true )
            {
                try
                {
                    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(requestUrl);
                    // Proxy settings.
                    request.Proxy = new WebProxy("http://66.35.68.145:3128");
                    request.Method = "GET";
                    request.Timeout = 1000 * 60 * 2;
                    request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
                    request.KeepAlive = true;
                    request.ReadWriteTimeout = 1000 * 60 * 2;

                    // Both the response and the reader are released even when reading fails.
                    using ( HttpWebResponse response = (HttpWebResponse)request.GetResponse() )
                    using ( StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("Shift_JIS")) )
                    {
                        return reader.ReadToEnd();
                    }
                }
                catch ( WebException )
                {
                    // Connection or HTTP failure: wait one second, then try again.
                    Thread.Sleep(1000);
                }
                catch ( IOException )
                {
                    // The stream broke while reading: wait one second, then try again.
                    Thread.Sleep(1000);
                }
            }
        }
        #endregion

        #region GetHref
        /// <summary>
        /// Cuts the first list out of a downloaded page and splits it into its entries.
        /// </summary>
        /// <param name="results">HTML text of the page; must contain "&lt;li&gt;" followed by "&lt;/ul&gt;".</param>
        /// <returns>
        /// The pieces obtained by splitting on "&lt;li&gt;"; element 0 is the (normally empty)
        /// text before the first entry. Only tags whose name starts with "a", "/a" or "li"
        /// survive the clean-up.
        /// </returns>
        public string[] GetHref(string results)
        {
            // Keep the text from the first "<li>" up to, but not including, the next "</ul>".
            int listStart = results.IndexOf("<li>");
            string listHtml = results.Substring(listStart);
            int listEnd = listHtml.IndexOf("</ul>");
            listHtml = listHtml.Remove(listEnd);

            // Strip every other tag plus CR, LF and TAB characters.
            listHtml = Regex.Replace(listHtml, "<(?!a|/a|li).*?>|\r|\n|\t", "");
            // Strip <input> elements that the first pass could not match.
            listHtml = Regex.Replace(listHtml, @"<input[^>]*/?>", "");

            return Regex.Split(listHtml, "<li>");
        }
        #endregion

        /// <summary>
        /// Parses the home page: isolates the section that starts at the
        /// "ジャンルから探す" heading and ends before the next "&lt;/div&gt;", strips the markup
        /// that is not needed, and splits what is left at every "&lt;/a&gt;".
        /// </summary>
        /// <param name="results">HTML text of the home page; must contain the heading and a following "&lt;/div&gt;".</param>
        /// <returns>The fragments between the closing anchor tags, in page order.</returns>
        public string[] GenreSearch(string results)
        {
            int sectionStart = results.IndexOf("<h1>ジャンルから探す</h1>");
            string section = results.Substring(sectionStart);

            int sectionEnd = section.IndexOf("</div>");
            section = section.Remove(sectionEnd);

            // Spaces, <br> tags and line feeds are dropped before the tag clean-up.
            section = section.Replace(" ", "");
            section = section.Replace("<br>", "");
            section = section.Replace("\n", "");

            // Remove <span> elements together with their content.
            section = Regex.Replace(section, @"<span>.*?</span>", "", RegexOptions.IgnoreCase);
            // Remove every tag whose name does not start with "a", "/a" or "/h2", plus CR/LF/TAB.
            section = Regex.Replace(section, @"<(?!a|/a|/h2).*?>|\r|\n|\t", "");

            return Regex.Split(section, "</a>");
        }

        /// <summary>
        /// Loads the four condition columns of table [条件ジャンル] from the Access database
        /// in the application start-up folder into <see cref="DtCondition"/>.
        /// </summary>
        /// <remarks>
        /// Rows left over from an earlier call are removed first. Fill appends to the rows
        /// already in the table, so without this every further run would process each
        /// condition once more than the run before.
        /// </remarks>
        private void ReadCondition()
        {
            string sqlStr = "SELECT [施設1],[施設2],[施設3],[施設4] FROM [条件ジャンル] ";

            using ( OleDbConnection con = new OleDbConnection("Provider=Microsoft.Jet.OLEDB.4.0;Data Source=" + Application.StartupPath + @"\A23：施設情報20130712.mdb") )
            using ( OleDbDataAdapter adapter = new OleDbDataAdapter(sqlStr, con) )
            {
                // Drop previously loaded rows; the columns stay in place.
                DtCondition.Clear();

                // Fill opens the closed connection itself and closes it again when done.
                adapter.Fill(DtCondition);
            }
        }

        /// <summary>
        /// Reads LastGenList.txt from the application start-up folder into a table.
        /// </summary>
        /// <returns>A table named "Url" with a single column and one row per line of the file.</returns>
        public DataTable ReadUrlTxt()
        {
            DataTable urlTable = new DataTable("Url");
            urlTable.Columns.Add();

            using ( StreamReader input = new StreamReader(Application.StartupPath + @"\LastGenList.txt") )
            {
                // ReadLine returns null once the end of the file has been reached.
                string line;
                while ( ( line = input.ReadLine() ) != null )
                {
                    urlTable.Rows.Add(line);
                }
            }

            return urlTable;
        }

        /// <summary>
        /// Processes every URL returned by <c>ReadUrlTxt</c>: fetches the first result page,
        /// derives the number of pages from the returned hit count (50 hits per page) and then
        /// fetches pages 2..N by rewriting the "&amp;pg=1" part of the URL.
        /// </summary>
        public void CatchArea()
        {
            foreach ( DataRow urlRow in ReadUrlTxt().Rows )
            {
                string firstPageUrl = urlRow[0].ToString();

                // The first request stores page 1 and reports the total number of hits.
                int hits = GetWebData(firstPageUrl);
                int pageCount = (int)Math.Ceiling(hits / 50.0);

                int page = 2;
                while ( page <= pageCount )
                {
                    GetWebData(firstPageUrl.Replace("&pg=1", "&pg=" + page.ToString()));
                    page++;
                }
            }
        }

        /// <summary>
        /// Downloads one result page, writes every facility found in its
        /// <c>&lt;article&gt;</c> elements to table [施設情報] of the Access database in the
        /// application start-up folder, and returns the hit count shown on the page.
        /// </summary>
        /// <param name="requestUrl">URL of the result page; "&amp;num=50" is appended before the request.</param>
        /// <returns>
        /// The first run of digits inside the first <c>&lt;li&gt;</c> element that follows the
        /// first "&lt;/button&gt;" of the page (callers treat it as the total number of hits).
        /// </returns>
        /// <remarks>
        /// Values are sent as command parameters, not spliced into the SQL text, so scraped
        /// text containing an apostrophe can no longer break or alter the INSERT statement.
        /// Each article triggers at least one extra request for its map page.
        /// </remarks>
        public int GetWebData(string requestUrl)
        {
            // One statement for every record; missing area/genre levels are stored as "".
            const string insertSql =
                "INSERT INTO [施設情報] (住所1,住所2,住所3,ジャンル1,ジャンル2,ジャンル3,ジャンル4,施設名称,郵便No,住所,TEL1記号,TEL1No,経度,緯度,公式URL,URL)"
                + " VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)";

            string results = webRequest(requestUrl + "&num=50");
            results = results.Substring(results.IndexOf("</button>"));
            // All whitespace is removed, so the patterns below contain no spaces either.
            results = Regex.Replace(results, @"\n|\r|\t", "").Replace(" ", "");

            // Number of search results.
            string strCount = Regex.Match(results, @"<li.*?>.*?</li>").Value;
            strCount = Regex.Match(strCount, @"[0-9]+").Value;
            int count = Convert.ToInt32(strCount);

            // Area levels, separated by ';'.
            string area = Regex.Match(results, @">.*?<h4").Value;
            area = Regex.Replace(area, @"^>|<[^>]*>|>|<h4", "");
            string[] arrayArea = area.Split(';');

            // Genre levels, separated by ';'.
            string type = Regex.Match(results, @"全ジャンル.*?</div>").Value;
            type = Regex.Replace(type, @"^.*?>|<[^>]*>|>", "");
            string[] arrayType = type.Split(';');

            MatchCollection article = Regex.Matches(results, @"<article>.*?</article>");
            using ( OleDbConnection con = new OleDbConnection("Provider=Microsoft.Jet.OLEDB.4.0;Data Source=" + Application.StartupPath + @"\A23：施設情報20130712.mdb") )
            using ( OleDbCommand cmd = con.CreateCommand() )
            {
                cmd.CommandText = insertSql;
                con.Open();
                foreach ( Match match in article )
                {
                    // Facility name.
                    string name = Regex.Match(match.Value, "<h4class=\"clearfix\">.*?</a>").Value;
                    name = Regex.Replace(name, @"<[^>]*>", "");

                    // Postal code (first 10 characters of the match) and address (the rest).
                    string zipCode = Regex.Match(match.Value, @"〒.*?<").Value;
                    string address = zipCode.Substring(10).Replace("<", "");
                    zipCode = zipCode.Substring(0, 10).Trim();

                    // Map link: the tag between the address and the "地図・ナビ" label.
                    // The address is scraped text, so it is escaped before it becomes part of a pattern.
                    string mapUrl = Regex.Match(match.Value, Regex.Escape(address) + @"<.*?>地図・ナビ").Value;
                    mapUrl = "http://itp.ne.jp" + Regex.Match(mapUrl, "\".*?\"").Value.Replace("\"", "");
                    string mapResult = webRequest(mapUrl);

                    // Longitude and latitude are taken from the first two <input .../> elements of the map page.
                    Match firstInput = Regex.Match(mapResult, "<input.*?/>");
                    string longitude = firstInput.Value;
                    string latitude = firstInput.NextMatch().Value;

                    longitude = Regex.Match(longitude, "value=\".*?\"").Value.Replace("value=", "").Replace("\"", "");
                    latitude = Regex.Match(latitude, "value=\".*?\"").Value.Replace("value=", "").Replace("\"", "");
                    if ( longitude == "" )
                    {
                        // No value on the map page itself: follow the script referenced after the
                        // first <input and read the two val("...") calls from it instead.
                        mapResult = mapResult.Substring(mapResult.IndexOf("<input"));
                        mapResult = mapResult.Remove(mapResult.IndexOf("</script>"));
                        mapResult = Regex.Match(mapResult, "<script.*?>").Value;
                        mapUrl = Regex.Match(mapResult, "\".*?\"").NextMatch().Value.Replace("\"", "");
                        mapResult = webRequest(mapUrl);

                        Match firstVal = Regex.Match(mapResult, @"val\(.*?\)");
                        longitude = firstVal.Value.Replace("val(\"", "").Replace("\")", "");
                        latitude = firstVal.NextMatch().Value.Replace("val(\"", "").Replace("\")", "");
                    }

                    // Telephone: the part before the first digit is the marker, the rest is the number.
                    string tel = Regex.Match(match.Value, @"TEL.*?</p>").Value.Replace("TEL</span>", "").Replace("</p>", "");
                    tel = Regex.Replace(tel, "<[^>]*>", "");
                    string telNo = Regex.Match(tel, "[0-9].*").Value;
                    tel = Regex.Replace(tel, "[0-9].*", "");

                    // Link.
                    string url = Regex.Match(match.Value, @"URL.*?</br>").Value.Replace("URL</span>", "").Replace("</br>", "");

                    // Home page.
                    string homePage = Regex.Match(match.Value, @"<.*?>HP").Value;
                    if ( homePage != "" )
                    {
                        homePage = "http://itp.ne.jp" + Regex.Match(homePage, "\".*?\"").Value.Replace("\"", "").Replace("amp;", "");
                    }

                    // OLE DB binds parameters by position, so the order below must follow the
                    // column list of the INSERT statement exactly.
                    object[] values = new object[]
                    {
                        ElementOrEmpty(arrayArea, 0), ElementOrEmpty(arrayArea, 1), ElementOrEmpty(arrayArea, 2),
                        ElementOrEmpty(arrayType, 0), ElementOrEmpty(arrayType, 1),
                        ElementOrEmpty(arrayType, 2), ElementOrEmpty(arrayType, 3),
                        name, zipCode, address, tel, telNo,
                        Convert.ToInt32(longitude), Convert.ToInt32(latitude),
                        homePage, url
                    };

                    cmd.Parameters.Clear();
                    for ( int p = 0; p < values.Length; p++ )
                    {
                        cmd.Parameters.AddWithValue("@p" + p.ToString(), values[p]);
                    }

                    cmd.ExecuteNonQuery();
                }
            }
            return count;
        }

        // Returns items[index], or "" when the array has no element at that position.
        private static string ElementOrEmpty(string[] items, int index)
        {
            return index < items.Length ? items[index] : "";
        }
    }
}
william11zhu
关注
0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
8、多层次网页数据抓取

using System;using System.Collections.Generic;using System.ComponentModel;using System.Data;using System.Drawing;using System.Text;using System.Windows.Forms;using System.Net;using System.IO;
复制链接

扫一扫