using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Threading;
using System.Data.OleDb;
namespace Institute
{
public partial class Form1 : Form
{
/// <summary>
/// Creates the form and runs InitializeComponent(), which is defined in another part of this partial class.
/// </summary>
public Form1()
{
InitializeComponent();
}
// Backing table for DtCondition; starts out empty with the table name "Condition".
private DataTable dtCondition = new DataTable("Condition");
/// <summary>
/// Gets or sets the condition table. ReadCondition() fills it, and CatchPage and the
/// condition-aware CatchChildPage overload read its rows and columns by index.
/// </summary>
public DataTable DtCondition
{
set { dtCondition = value; }
get { return dtCondition; }
}
/// <summary>
/// Click handler for the start button: loads the condition table, walks the site
/// starting from its home page, then shows a message box once the walk has finished.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnStart_Click(object sender, EventArgs e)
{
    const string homePageUrl = "http://itp.ne.jp";
    const string finishedMessage = "URL获取完毕!";

    ReadCondition();
    CatchPage(homePageUrl);

    MessageBox.Show(finishedMessage);
}
/// <summary>
/// Click handler for the get-data button: runs CatchArea().
/// </summary>
private void btnGetData_Click(object sender, EventArgs e)
{
CatchArea();
}
#region CatchPage
/// <summary>
/// Downloads <paramref name="requestUrl"/>, splits it into genre link fragments with
/// GenreSearch, and for every row of DtCondition follows the first fragment whose label
/// equals the row's column 0.
/// </summary>
/// <remarks>
/// When column 1 of the row is non-empty the condition-aware CatchChildPage overload is
/// used (starting at column 1); otherwise the overload that appends URLs to
/// "LastGenList.txt" is used. The last fragment returned by GenreSearch is never examined.
/// </remarks>
/// <param name="requestUrl">URL of the page that lists the top-level genres.</param>
public void CatchPage(string requestUrl)
{
    const string siteRoot = "http://itp.ne.jp";
    const int firstChildColumn = 1;

    // Download the page and cut it into one fragment per genre link.
    string[] fragments = GenreSearch(webRequest(requestUrl));

    for ( int rowIndex = 0; rowIndex < DtCondition.Rows.Count; rowIndex++ )
    {
        int fragmentIndex = 0;
        while ( fragmentIndex < fragments.Length - 1 )
        {
            string fragment = fragments[fragmentIndex];
            fragmentIndex++;

            // Label text of the link: whatever follows '>' with the markup remnants stripped.
            string label = Regex.Match(fragment, ">.*?</h2>|>.*").Value;
            label = label.Replace(">", "");
            label = label.Replace("</h2", "");
            label = label.Trim();

            if ( DtCondition.Rows[rowIndex][0].ToString() != label )
            {
                continue;
            }

            // Quoted strings in the fragment; the first one is used as the link target.
            MatchCollection quoted = Regex.Matches(fragment, "\".*?\"");
            bool hasChildCondition = DtCondition.Rows[rowIndex][firstChildColumn].ToString() != "";
            if ( hasChildCondition )
            {
                CatchChildPage(siteRoot + quoted[0].Value, firstChildColumn, rowIndex);
            }
            else
            {
                CatchChildPage(siteRoot + quoted[0].Value, "LastGenList.txt");
            }

            // Only the first matching fragment is followed for each condition row.
            break;
        }
    }
}
#endregion
#region CatchChildPage_1
/// <summary>
/// Follows one level of the genre tree for a condition row that still has a condition
/// at column <paramref name="key"/>.
/// </summary>
/// <remarks>
/// The page is downloaded and split with GetHref; the first entry whose label (text
/// before the "(" of the hit count) equals the condition is followed. If the next column
/// exists and is non-empty, this overload recurses with the second quoted string of the
/// entry; otherwise the single-argument overload is called with the first quoted string.
/// Entries without any label text are skipped, and an entry without "(" is compared
/// using its whole label, instead of throwing.
/// </remarks>
/// <param name="requestUrl">Page URL; embedded double quotes are removed before use.</param>
/// <param name="key">Column index in DtCondition of the condition to match on this page.</param>
/// <param name="i">Row index in DtCondition of the condition being followed.</param>
public void CatchChildPage(string requestUrl, int key, int i)
{
    // Condition columns are limited to indexes below 4 (as in the original check) and to
    // the columns that actually exist, so the table is never indexed out of range.
    int keyLimit = Math.Min(4, DtCondition.Columns.Count);
    if ( key >= keyLimit )
    {
        return;
    }

    requestUrl = requestUrl.Replace("\"", "");
    string[] groupArray = GetHref(webRequest(requestUrl));
    string wanted = DtCondition.Rows[i][key].ToString();

    // Element 0 is the text before the first <li>, so entries start at index 1.
    for ( int j = 1; j < groupArray.Length; j++ )
    {
        string strCondition = Regex.Match(groupArray[j], @">.*?<").Value;
        if ( strCondition.Length == 0 )
        {
            // No ">...<" text in this entry: nothing to compare.
            continue;
        }

        // The label ends at the "(" that opens the hit count; if there is none, it ends
        // at the closing '<' of the match.
        int parenIndex = strCondition.IndexOf('(');
        int labelEnd = parenIndex >= 0 ? parenIndex : strCondition.Length - 1;
        strCondition = strCondition.Substring(1, labelEnd - 1).Trim();

        if ( !strCondition.Equals(wanted) )
        {
            continue;
        }

        MatchCollection request = Regex.Matches(groupArray[j], "\".*?\"");
        int nextKey = key + 1;
        bool hasNextCondition = nextKey < keyLimit
            && DtCondition.Rows[i][nextKey].ToString() != "";
        if ( hasNextCondition )
        {
            // A deeper condition exists: keep matching with the condition-aware overload.
            CatchChildPage("http://itp.ne.jp" + request[1].Value, nextKey, i);
        }
        else
        {
            // No deeper condition: walk everything below this entry.
            CatchChildPage("http://itp.ne.jp" + request[0].Value);
        }

        // Only the first matching entry is followed.
        break;
    }
}
#endregion
#region CatchChildPage_2
/// <summary>
/// Walks a genre page when no further condition applies: every entry returned by
/// GetHref is followed.
/// </summary>
/// <remarks>
/// An entry with more than one quoted string is treated as having a deeper level and
/// is walked recursively using its second quoted string. An entry with exactly one
/// quoted string, whose parenthesised count is not "(0件)", is handed to the overload
/// that appends URLs to "LastGenList.txt".
/// </remarks>
/// <param name="requestUrl">Page URL; embedded double quotes are removed before use.</param>
public void CatchChildPage(string requestUrl)
{
    const string siteRoot = "http://itp.ne.jp";

    string pageUrl = requestUrl.Replace("\"", "");
    string[] entries = GetHref(webRequest(pageUrl));

    // Element 0 is skipped; entries start at index 1.
    int index = 1;
    while ( index < entries.Length )
    {
        string entry = entries[index];
        index++;

        MatchCollection quoted = Regex.Matches(entry, "\".*?\"");
        string countText = Regex.Match(entry, @"\(.*?\)").Value;

        switch ( quoted.Count )
        {
            case 0:
                // Nothing to follow in this entry.
                break;
            case 1:
                // Single link and no deeper level: collect it unless it reports zero hits.
                if ( countText != "(0件)" )
                {
                    CatchChildPage(siteRoot + quoted[0].Value, "LastGenList.txt");
                }
                break;
            default:
                // More than one link: there is another level below, keep descending.
                CatchChildPage(siteRoot + quoted[1].Value);
                break;
        }
    }
}
#endregion
#region CatchChildPage_3
/// <summary>
/// Walks the area levels below <paramref name="requestUrl"/> and appends every final
/// URL to the file <paramref name="txt"/> in the application's start-up folder.
/// </summary>
/// <remarks>
/// An entry with more than one quoted string is walked recursively using its second
/// quoted string. An entry with exactly one quoted string, whose parenthesised count is
/// not "(0件)", is written as one line: the site root plus the link, with double quotes
/// and every "amp;" removed.
/// </remarks>
/// <param name="requestUrl">Page URL; embedded double quotes are removed before use.</param>
/// <param name="txt">File name (relative to Application.StartupPath) to append to.</param>
public void CatchChildPage(string requestUrl, string txt)
{
    const string siteRoot = "http://itp.ne.jp";

    string pageUrl = requestUrl.Replace("\"", "");
    string[] entries = GetHref(webRequest(pageUrl));

    // Element 0 is skipped; entries start at index 1.
    int index = 1;
    while ( index < entries.Length )
    {
        string entry = entries[index];
        index++;

        MatchCollection quoted = Regex.Matches(entry, "\".*?\"");
        string countText = Regex.Match(entry, @"\(.*?\)").Value;

        switch ( quoted.Count )
        {
            case 0:
                // Nothing to follow in this entry.
                break;
            case 1:
                if ( countText != "(0件)" )
                {
                    // The file is opened in append mode for each line, so recursive
                    // calls never hold it open at the same time.
                    using ( StreamWriter output = new StreamWriter(Application.StartupPath + @"\" + txt, true) )
                    {
                        string link = quoted[0].Value.Replace("\"", "").Replace("amp;", "");
                        output.WriteLine(siteRoot + link);
                    }
                }
                break;
            default:
                // More than one link: there is another level below, keep descending.
                CatchChildPage(siteRoot + quoted[1].Value, txt);
                break;
        }
    }
}
#endregion
#region webRequest
/// <summary>
/// Downloads the page at <paramref name="requestUrl"/> through the configured proxy and
/// returns its body decoded as Shift_JIS.
/// </summary>
/// <remarks>
/// Network failures (WebException, IOException) are retried indefinitely with a one
/// second pause between attempts. The retry is a loop rather than recursion, so a long
/// outage can no longer overflow the stack. Exceptions that a retry cannot fix (for
/// example UriFormatException for a malformed URL) propagate to the caller.
/// NOTE(review): Thread.Sleep blocks the calling thread; this method is reached from
/// button click handlers, so the UI is presumably frozen while retrying.
/// </remarks>
/// <param name="requestUrl">Absolute URL to fetch with GET.</param>
/// <returns>The full response body; never null.</returns>
public string webRequest(string requestUrl)
{
    while ( true )
    {
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(requestUrl);
            // Proxy settings
            request.Proxy = new WebProxy("http://66.35.68.145:3128");
            request.Method = "GET";
            request.Timeout = 1000 * 60 * 2;
            request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
            request.KeepAlive = true;
            request.ReadWriteTimeout = 1000 * 60 * 2;

            // Both the response and the reader are released even when reading fails.
            using ( HttpWebResponse response = (HttpWebResponse)request.GetResponse() )
            using ( StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("Shift_JIS")) )
            {
                return reader.ReadToEnd();
            }
        }
        catch ( WebException )
        {
            // Connection-level failure: wait one second, then try again.
            Thread.Sleep(1000);
        }
        catch ( IOException )
        {
            // The stream broke while reading the body: wait one second, then try again.
            Thread.Sleep(1000);
        }
    }
}
#endregion
#region GetHref
/// <summary>
/// Cuts the first list out of a downloaded page and splits it into one string per
/// list item.
/// </summary>
/// <remarks>
/// The text from the first "&lt;li&gt;" up to the following "&lt;/ul&gt;" is kept; CR, LF,
/// TAB, input tags and every tag that does not start with "a", "/a" or "li" are
/// removed, and the rest is split on "&lt;li&gt;". Because the kept text starts with
/// "&lt;li&gt;", element 0 of the result is the empty string and items start at index 1.
/// </remarks>
/// <param name="results">HTML of the page.</param>
/// <returns>
/// The split list items, or an empty array when the page is null/empty or does not
/// contain both markers (previously this threw ArgumentOutOfRangeException).
/// </returns>
public string[] GetHref(string results)
{
    if ( string.IsNullOrEmpty(results) )
    {
        return new string[0];
    }

    int listStart = results.IndexOf("<li>", StringComparison.Ordinal);
    if ( listStart < 0 )
    {
        return new string[0];
    }
    results = results.Substring(listStart);

    int listEnd = results.IndexOf("</ul>", StringComparison.Ordinal);
    if ( listEnd < 0 )
    {
        return new string[0];
    }
    results = results.Remove(listEnd);

    results = Regex.Replace(results, "<(?!a|/a|li).*?>|\r|\n|\t", "");
    // Tags that spanned several lines survive the first pass (its '.' does not cross
    // line breaks); once the line breaks are gone, input tags are removed here.
    results = Regex.Replace(results, @"<input[^>]*/?>", "");
    return Regex.Split(results, "<li>");
}
#endregion
/// <summary>
/// Cuts the "ジャンルから探す" section out of the home page and splits it into one
/// fragment per link.
/// </summary>
/// <remarks>
/// The text from the section heading up to the following "&lt;/div&gt;" is kept. Spaces,
/// "&lt;br&gt;", line breaks, tabs, span elements and every tag that does not start with
/// "a", "/a" or "/h2" are removed, and the rest is split on "&lt;/a&gt;". Removing spaces
/// also removes the space inside anchor tags. The last element of the result is the
/// text after the final "&lt;/a&gt;".
/// </remarks>
/// <param name="results">HTML of the home page.</param>
/// <returns>
/// The link fragments, or an empty array when the page is null/empty or does not
/// contain both markers (previously this threw ArgumentOutOfRangeException).
/// </returns>
public string[] GenreSearch(string results)
{
    if ( string.IsNullOrEmpty(results) )
    {
        return new string[0];
    }

    int sectionStart = results.IndexOf("<h1>ジャンルから探す</h1>", StringComparison.Ordinal);
    if ( sectionStart < 0 )
    {
        return new string[0];
    }
    results = results.Substring(sectionStart);

    int sectionEnd = results.IndexOf("</div>", StringComparison.Ordinal);
    if ( sectionEnd < 0 )
    {
        return new string[0];
    }
    results = results.Remove(sectionEnd).Replace(" ", "").Replace("<br>", "").Replace("\n", "");

    results = Regex.Replace(results, @"<span>.*?</span>", "", RegexOptions.IgnoreCase);
    results = Regex.Replace(results, @"<(?!a|/a|/h2).*?>|\r|\n|\t", "");
    return Regex.Split(results, "</a>");
}
/// <summary>
/// Loads the four 施設 columns of the table [条件ジャンル] from the Access database in
/// the application's start-up folder into DtCondition.
/// </summary>
/// <remarks>
/// Existing rows are cleared first, so calling this more than once (for example by
/// clicking the start button again) reloads the conditions instead of appending
/// duplicates. The adapter opens and closes the connection itself during Fill.
/// </remarks>
private void ReadCondition()
{
    string sqlStr = "SELECT [施設1],[施設2],[施設3],[施設4] FROM [条件ジャンル] ";
    using ( OleDbConnection con = new OleDbConnection("Provider=Microsoft.Jet.OLEDB.4.0;Data Source=" + Application.StartupPath + @"\A23:施設情報20130712.mdb") )
    using ( OleDbDataAdapter adapter = new OleDbDataAdapter(sqlStr, con) )
    {
        DtCondition.Clear();
        adapter.Fill(DtCondition);
    }
}
/// <summary>
/// Reads "LastGenList.txt" from the application's start-up folder into a table.
/// </summary>
/// <returns>
/// A table named "Url" with a single column and one row per line of the file.
/// </returns>
public DataTable ReadUrlTxt()
{
    DataTable urlTable = new DataTable("Url");
    urlTable.Columns.Add();

    using ( StreamReader input = new StreamReader(Application.StartupPath + @"\LastGenList.txt") )
    {
        // ReadLine returns null exactly when the end of the stream has been reached.
        string line;
        while ( ( line = input.ReadLine() ) != null )
        {
            urlTable.Rows.Add(new object[] { line });
        }
    }

    return urlTable;
}
/// <summary>
/// Processes every URL stored by ReadUrlTxt: the URL itself is passed to GetWebData,
/// and the count it returns decides how many further result pages are requested.
/// </summary>
/// <remarks>
/// Pages are assumed to hold 50 results (the count is divided by 50 and rounded up).
/// Pages 2..N are requested by replacing "&amp;pg=1" in the stored URL with "&amp;pg=N".
/// </remarks>
public void CatchArea()
{
    const double resultsPerPage = 50.0;

    DataTable urlTable = ReadUrlTxt();
    for ( int rowIndex = 0; rowIndex < urlTable.Rows.Count; rowIndex++ )
    {
        DataRow urlRow = urlTable.Rows[rowIndex];

        // The first page is fetched through the stored URL and yields the total count.
        int totalResults = GetWebData(urlRow[0].ToString());
        int pageCount = (int)Math.Ceiling(totalResults / resultsPerPage);

        int page = 2;
        while ( page <= pageCount )
        {
            string pageUrl = urlRow[0].ToString().Replace("&pg=1", "&pg=" + page.ToString());
            GetWebData(pageUrl);
            page++;
        }
    }
}
/// <summary>
/// Downloads one result page (50 results per page), extracts every article element on
/// it and inserts one row per article into the table [施設情報] of the Access database in
/// the application's start-up folder.
/// </summary>
/// <remarks>
/// Values are passed as OLE DB parameters rather than concatenated into the SQL text,
/// so scraped text containing an apostrophe can no longer break or alter the statement.
/// Area and genre lists are padded with empty strings to 3 and 4 entries; extra genre
/// entries are ignored. For every article the map page is downloaded as well (and, when
/// it carries no coordinates, a script it references), to obtain longitude and latitude.
/// NOTE(review): a page without "&lt;/button&gt;", an article whose postal text is shorter
/// than 10 characters, or coordinates that are not integers still raise an exception,
/// as before.
/// </remarks>
/// <param name="requestUrl">Result page URL; "&amp;num=50" is appended before fetching.</param>
/// <returns>The first number found in the first li element after "&lt;/button&gt;", used by
/// the caller as the total number of search results.</returns>
public int GetWebData(string requestUrl)
{
    string results = webRequest(requestUrl + "&num=50");
    results = results.Substring(results.IndexOf("</button>"));
    // Line breaks, tabs and all spaces are removed; the patterns below rely on that
    // (for example "<h4class=").
    results = Regex.Replace(results, @"\n|\r|\t", "").Replace(" ", "");

    // count is the number of search results
    string strCount = Regex.Match(results, @"<li.*?>.*?</li>").Value;
    strCount = Regex.Match(strCount, @"[0-9]+").Value;
    int count = Convert.ToInt32(strCount);

    // Area, split on ';' and padded to the three 住所 columns
    string area = Regex.Match(results, @">.*?<h4").Value;
    area = Regex.Replace(area, @"^>|<[^>]*>|>|<h4", "");
    string[] arrayArea = PadFields(area.Split(';'), 3);

    // Genre, split on ';' and padded to the four ジャンル columns
    string type = Regex.Match(results, @"全ジャンル.*?</div>").Value;
    type = Regex.Replace(type, @"^.*?>|<[^>]*>|>", "");
    string[] arrayType = PadFields(type.Split(';'), 4);

    MatchCollection article = Regex.Matches(results, @"<article>.*?</article>");
    using ( OleDbConnection con = new OleDbConnection("Provider=Microsoft.Jet.OLEDB.4.0;Data Source=" + Application.StartupPath + @"\A23:施設情報20130712.mdb") )
    using ( OleDbCommand cmd = con.CreateCommand() )
    {
        // OLE DB parameters are positional: the order they are added in must follow
        // the column list.
        cmd.CommandText = "INSERT INTO [施設情報] (住所1,住所2,住所3,ジャンル1,ジャンル2,ジャンル3,ジャンル4,施設名称,郵便No,住所,TEL1記号,TEL1No,経度,緯度,公式URL,URL)"
            + " VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)";
        con.Open();
        foreach ( Match match in article )
        {
            // Facility name
            string name = Regex.Match(match.Value, "<h4class=\"clearfix\">.*?</a>").Value;
            name = Regex.Replace(name, @"<[^>]*>", "");

            // Postal code (first 10 characters) and address (the rest)
            string zipCode = Regex.Match(match.Value, @"〒.*?<").Value;
            string address = zipCode.Substring(10).Replace("<", "");
            zipCode = zipCode.Substring(0, 10).Trim();

            // Map link: the address is matched literally, so it is escaped first.
            string mapUrl = Regex.Match(match.Value, Regex.Escape(address) + @"<.*?>地図・ナビ").Value;
            mapUrl = "http://itp.ne.jp" + Regex.Match(mapUrl, "\".*?\"").Value.Replace("\"", "");
            string mapResult = webRequest(mapUrl);

            // Longitude is taken from the first input tag of the map page, latitude
            // from the second.
            string longitude = Regex.Match(mapResult, "<input.*?/>").Value;
            string latitude = Regex.Match(mapResult, "<input.*?/>").NextMatch().Value;
            longitude = Regex.Match(longitude, "value=\".*?\"").Value.Replace("value=", "").Replace("\"", "");
            latitude = Regex.Match(latitude, "value=\".*?\"").Value.Replace("value=", "").Replace("\"", "");
            if ( longitude == "" )
            {
                // No coordinates in the page itself: download the script it references
                // (second quoted string of the script tag) and read the val("...") calls.
                mapResult = mapResult.Substring(mapResult.IndexOf("<input"));
                mapResult = mapResult.Remove(mapResult.IndexOf("</script>"));
                mapResult = Regex.Match(mapResult, "<script.*?>").Value;
                mapUrl = Regex.Match(mapResult, "\".*?\"").NextMatch().Value.Replace("\"", "");
                mapResult = webRequest(mapUrl);
                longitude = Regex.Match(mapResult, @"val\(.*?\)").Value.Replace("val(\"", "").Replace("\")", "");
                latitude = Regex.Match(mapResult, @"val\(.*?\)").NextMatch().Value.Replace("val(\"", "").Replace("\")", "");
            }

            // Telephone: the text before the first digit is the label, the rest the number
            string tel = Regex.Match(match.Value, @"TEL.*?</p>").Value.Replace("TEL</span>", "").Replace("</p>", "");
            tel = Regex.Replace(tel, "<[^>]*>", "");
            string telNo = Regex.Match(tel, "[0-9].*").Value;
            tel = Regex.Replace(tel, "[0-9].*", "");

            // Link
            string url = Regex.Match(match.Value, @"URL.*?</br>").Value.Replace("URL</span>", "").Replace("</br>", "");

            // Home page
            string homePage = Regex.Match(match.Value, @"<.*?>HP").Value;
            if ( homePage != "" )
            {
                homePage = "http://itp.ne.jp" + Regex.Match(homePage, "\".*?\"").Value.Replace("\"", "").Replace("amp;", "");
            }

            cmd.Parameters.Clear();
            cmd.Parameters.AddWithValue("@area1", arrayArea[0]);
            cmd.Parameters.AddWithValue("@area2", arrayArea[1]);
            cmd.Parameters.AddWithValue("@area3", arrayArea[2]);
            cmd.Parameters.AddWithValue("@genre1", arrayType[0]);
            cmd.Parameters.AddWithValue("@genre2", arrayType[1]);
            cmd.Parameters.AddWithValue("@genre3", arrayType[2]);
            cmd.Parameters.AddWithValue("@genre4", arrayType[3]);
            cmd.Parameters.AddWithValue("@name", name);
            cmd.Parameters.AddWithValue("@zipCode", zipCode);
            cmd.Parameters.AddWithValue("@address", address);
            cmd.Parameters.AddWithValue("@telLabel", tel);
            cmd.Parameters.AddWithValue("@telNo", telNo);
            cmd.Parameters.AddWithValue("@longitude", Convert.ToInt32(longitude));
            cmd.Parameters.AddWithValue("@latitude", Convert.ToInt32(latitude));
            cmd.Parameters.AddWithValue("@homePage", homePage);
            cmd.Parameters.AddWithValue("@url", url);
            cmd.ExecuteNonQuery();
        }
    }
    return count;
}

// Returns a copy of values with exactly length entries: missing entries become "",
// surplus entries are dropped.
private static string[] PadFields(string[] values, int length)
{
    string[] padded = new string[length];
    for ( int index = 0; index < length; index++ )
    {
        padded[index] = index < values.Length ? values[index] : "";
    }
    return padded;
}
}
}
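// 8、多层次网页数据抓取 (8. Multi-level web page data scraping)
// 最新推荐文章于 2023-01-29 09:49:04 发布 (web page metadata: "latest recommended article published 2023-01-29 09:49:04")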