废话
正则表达式就是个坑:学了之后不常用就会忘光,可是编码过程中万一遇上,又是一个大坑。(偷偷喜欢一个姑娘。)
上代码
主体方法之外的辅助方法本来可以拆分到几个单独的类里,这里为了省事,全部放在了同一个类中。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.Services;
using System.Collections;
using System.Text.RegularExpressions;
using System.Net;
using System.IO;
using System.Text;
/// <summary>
/// ASMX web service that scrapes the WTO/TBT and WTO/SPS notification lists published on
/// gdtbt.org.cn. Pages are downloaded over HTTP and parsed with regular expressions.
/// </summary>
[WebService(Namespace = "http://tempuri.org/")]
[WebServiceBinding(ConformsTo = WsiProfiles.BasicProfile1_1)]
// To allow this web service to be called from script using ASP.NET AJAX, uncomment the following line.
// [System.Web.Script.Services.ScriptService]
public class zzkPaChong : System.Web.Services.WebService
{
    // Root against which the relative detail-page links found in the list pages are resolved.
    private static readonly Uri SiteRootUri = new Uri("http://gdtbt.org.cn/");

    // The patterns are loop-invariant, so they are built once instead of once per row/cell.
    private static readonly Regex ListTableRegex =
        new Regex(@"<table[^>]*(class=""tablelist"")[^>]*>[\s\S]*?</table>");
    private static readonly Regex RowRegex =
        new Regex(@"<tr[^>]*>(?<ww>[\s\S]*?)</tr>");
    private static readonly Regex CellRegex =
        new Regex(@"<td[^>]*>(?<ww>[\s\S]*?)</td>");
    private static readonly Regex NumberHrefRegex =
        new Regex(@"(?is)<a[^>]*href=([""'])?(?<href>[^'""]+)\1[^>]*class=""tbh"">");
    private static readonly Regex NumberTextRegex =
        new Regex(@"(?is)<a[^>]*class=""tbh"">(?<ww>[\s\S]*?)</a>");
    private static readonly Regex TitleTextRegex =
        new Regex(@"(?is)<a[^>]*class=""tbbt"">(?<ww>[\s\S]*?)</a>");
    // Uses .NET balancing groups so that nested <div> elements inside the "tb1" container
    // are matched up to the container's own closing tag.
    private static readonly Regex DetailBodyRegex =
        new Regex(@"(?is)<div class=""tb1[^>]*>(?><div[^>]*>(?<o>)|</div>(?<-o>)|(?:(?!</?div\b).)*)*(?(o)(?!))</div>");

    public zzkPaChong()
    {
        // Uncomment the following line if using designed components.
        //InitializeComponent();
    }

    /// <summary>
    /// Template smoke-test endpoint.
    /// </summary>
    /// <returns>The constant string "Hello World".</returns>
    [WebMethod]
    public string HelloWorld()
    {
        return "Hello World";
    }

    /// <summary>
    /// Runs one crawl via <see cref="tbt_sps"/> and reports completion.
    /// </summary>
    /// <returns>A fixed JSON status string; it is returned whenever the crawl does not throw.</returns>
    [WebMethod]
    public string zzkPaChongTest()
    {
        tbt_sps();
        return "{\"status\":1, \"msg\":\"调用完成!\"}";
    }

    #region Main method

    /// <summary>
    /// Crawls the TBT and SPS list pages of gdtbt.org.cn, and for every successfully parsed row
    /// downloads its detail page and assembles a <c>Model.wsn_tbt_sps</c> record.
    /// </summary>
    /// <remarks>
    /// Persistence is intentionally left disabled, exactly as in the original code: the record is
    /// built but not stored. Rows that cannot be turned into a record (error rows, rows without a
    /// detail link) are skipped instead of aborting the whole crawl.
    /// </remarks>
    public void tbt_sps()
    {
        // Kept so the disabled persistence steps noted below can be re-enabled without other changes.
        BLL.wsn_tbt_sps bwts = new BLL.wsn_tbt_sps();

        Dictionary<string, string> listPages = new Dictionary<string, string>();
        listPages.Add("WTO/TBT", "http://gdtbt.org.cn/tbt.aspx");
        listPages.Add("WTO/SPS", "http://gdtbt.org.cn/sps.aspx");

        foreach (var page in listPages)
        {
            foreach (Hashtable row in ListHtml(page.Value))
            {
                // NOTE(review): the original sketched a duplicate check here before building the
                // record, roughly bwts.GetModelList(" TBNumber='...' ").Count <= 0. It was disabled,
                // indexed the list instead of the row, and concatenated scraped text into SQL;
                // use a parameterized lookup if it is ever re-enabled.
                Model.wsn_tbt_sps record = BuildRecord(row);
                if (record == null)
                {
                    continue;
                }

                // Disabled in the original: int bolid = bwts.Add(record);
            }
        }
    }

    /// <summary>
    /// Converts one row produced by <see cref="ListHtml"/> into a model object, downloading the
    /// detail page body along the way.
    /// </summary>
    /// <param name="row">Row hashtable as produced by <see cref="ListHtml"/>.</param>
    /// <returns>The populated record, or null when the row is an error row or has no usable link.</returns>
    private Model.wsn_tbt_sps BuildRecord(Hashtable row)
    {
        if (ReadText(row, "status") != "1")
        {
            return null;
        }

        // A data row without a link used to cause a NullReferenceException that stopped the crawl.
        string href = ReadText(row, "ziurl");
        Uri detailUri;
        if (string.IsNullOrEmpty(href) || !Uri.TryCreate(SiteRootUri, href, out detailUri))
        {
            return null;
        }

        Model.wsn_tbt_sps record = new Model.wsn_tbt_sps();

        string title = ReadText(row, "TBTitle");
        if (title != null)
        {
            record.TBTitle = title;
        }

        string member = ReadText(row, "TBChengYuan");
        if (member != null)
        {
            record.TBChengYuan = member;
        }

        // Scraped text is not guaranteed to be a date; an unparsable value leaves TBDate unset
        // instead of throwing and aborting every remaining row.
        string dateText = ReadText(row, "TBDate");
        DateTime parsedDate;
        if (dateText != null && DateTime.TryParse(dateText, out parsedDate))
        {
            record.TBDate = parsedDate;
        }

        string number = ReadText(row, "TBNumber");
        if (number != null)
        {
            record.TBNumber = number;
            // Anything whose number does not contain "TBT" is classified as SPS.
            record.TBType = number.IndexOf("TBT", StringComparison.Ordinal) >= 0 ? "TBT" : "SPS";
        }

        record.TBbody = details(detailUri.AbsoluteUri);
        record.zq_Time = DateTime.Now;
        record.state = 0;
        return record;
    }

    /// <summary>
    /// Reads a hashtable entry as a string.
    /// </summary>
    /// <returns>The entry's string form, or null when the key is absent or its value is null.</returns>
    private static string ReadText(Hashtable row, string key)
    {
        object value = row[key];
        return value == null ? null : value.ToString();
    }

    #endregion

    #region Page scraping methods

    /// <summary>
    /// Downloads a list page and extracts one hashtable per table row that contains
    /// <c>&lt;td&gt;</c> cells.
    /// </summary>
    /// <param name="url">Absolute URL of the list page, including the http:// prefix.</param>
    /// <returns>
    /// One hashtable per parsed row with "status" = "1" and, when found, the keys "ziurl",
    /// "TBNumber", "TBTitle", "TBChengYuan" and "TBDate". On failure an entry with
    /// "status" = "0" and "msg" describing the problem is appended; rows parsed before the
    /// failure are kept.
    /// </returns>
    public List<Hashtable> ListHtml(string url)
    {
        List<Hashtable> rows = new List<Hashtable>();
        try
        {
            string pageHtml = FetchPage(url, "UTF-8");
            if (pageHtml == null)
            {
                rows.Add(CreateErrorRow("Failed to download " + url));
                return rows;
            }

            Match table = ListTableRegex.Match(pageHtml);
            if (!table.Success)
            {
                rows.Add(CreateErrorRow("No table with class \"tablelist\" found at " + url));
                return rows;
            }

            foreach (Match tr in RowRegex.Matches(table.Value))
            {
                MatchCollection cells = CellRegex.Matches(tr.Groups["ww"].Value.Trim());
                if (cells.Count == 0)
                {
                    // Rows without <td> cells carry no data.
                    continue;
                }

                Hashtable row = new Hashtable();
                row.Add("status", "1");

                // Column 0: notification number and the link to its detail page.
                string numberCell = cells[0].Groups["ww"].Value;
                AddFirstMatch(row, "ziurl", NumberHrefRegex, numberCell, "href");
                AddFirstMatch(row, "TBNumber", NumberTextRegex, numberCell, "ww");

                // Column 1: title.
                if (cells.Count > 1)
                {
                    AddFirstMatch(row, "TBTitle", TitleTextRegex, cells[1].Groups["ww"].Value, "ww");
                }

                // Column 2: notifying member, stored as the raw cell content.
                if (cells.Count > 2)
                {
                    row.Add("TBChengYuan", cells[2].Groups["ww"].Value.Trim());
                }

                // Column 3: date, stored as the raw cell content.
                if (cells.Count > 3)
                {
                    row.Add("TBDate", cells[3].Groups["ww"].Value.Trim());
                }

                rows.Add(row);
            }
        }
        catch (Exception ex)
        {
            rows.Add(CreateErrorRow(ex.Message));
        }
        return rows;
    }

    /// <summary>
    /// Stores the trimmed value of a named group from the first match of <paramref name="pattern"/>.
    /// Only the first match is used, so a cell with several matching anchors no longer triggers a
    /// duplicate-key exception that discards the rest of the page.
    /// </summary>
    private static void AddFirstMatch(Hashtable row, string key, Regex pattern, string input, string groupName)
    {
        Match match = pattern.Match(input);
        if (match.Success)
        {
            row[key] = match.Groups[groupName].Value.Trim();
        }
    }

    /// <summary>
    /// Builds the failure entry returned by <see cref="ListHtml"/>.
    /// </summary>
    private static Hashtable CreateErrorRow(string message)
    {
        Hashtable row = new Hashtable();
        row.Add("status", "0");
        row.Add("msg", message);
        return row;
    }

    /// <summary>
    /// Downloads a detail page and extracts the markup of its first <c>div</c> whose class
    /// attribute starts with "tb1".
    /// </summary>
    /// <param name="url">Absolute URL of the detail page, including the http:// prefix.</param>
    /// <returns>The trimmed HTML of the matched element, or an empty string when nothing matches
    /// (which includes a failed download).</returns>
    public string details(string url)
    {
        Match body = DetailBodyRegex.Match(GetHTML(url));
        return body.Success ? body.Value.Trim() : "";
    }

    #endregion

    #region Utility methods

    /// <summary>
    /// Downloads a page and decodes it as UTF-8.
    /// </summary>
    /// <param name="url">Absolute URL to download.</param>
    /// <returns>The page content, or the legacy sentinel "1" when the download fails.</returns>
    private string GetHTML(string url)
    {
        return FetchPage(url, "UTF-8") ?? "1";
    }

    /// <summary>
    /// Downloads a page and decodes it as gb2312.
    /// </summary>
    /// <param name="url">Absolute URL to download.</param>
    /// <returns>The page content, or the legacy sentinel "1" when the download fails.</returns>
    private string GetHTML2312(string url)
    {
        return FetchPage(url, "gb2312") ?? "1";
    }

    /// <summary>
    /// Shared download routine: issues the request and reads the whole response body using the
    /// named text encoding. The response, stream and reader are disposed even when reading fails.
    /// </summary>
    /// <param name="url">Absolute URL to download.</param>
    /// <param name="encodingName">Encoding name understood by <see cref="Encoding.GetEncoding(string)"/>.</param>
    /// <returns>The decoded body, or null when any step fails.</returns>
    private static string FetchPage(string url, string encodingName)
    {
        try
        {
            Encoding encoding = Encoding.GetEncoding(encodingName);
            WebRequest request = WebRequest.Create(url);
            using (WebResponse response = request.GetResponse())
            using (Stream body = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(body, encoding))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception ex)
        {
            // Downloads stay best-effort (callers expect no exception), but the cause is traced
            // rather than silently discarded.
            System.Diagnostics.Trace.TraceWarning("zzkPaChong: failed to download {0}: {1}", url, ex.Message);
            return null;
        }
    }

    /// <summary>
    /// Alternative downloader that sends a browser user-agent header; worth trying for links
    /// the <see cref="WebRequest"/>-based helpers cannot fetch.
    /// </summary>
    /// <param name="url">Absolute URL to download.</param>
    /// <returns>The response body decoded as UTF-8.</returns>
    /// <remarks>Unlike the other helpers, failures propagate to the caller as exceptions.</remarks>
    public string GetHtmlwc(string url)
    {
        using (WebClient client = new WebClient())
        {
            client.Headers.Add("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)");
            using (Stream data = client.OpenRead(url))
            using (StreamReader reader = new StreamReader(data, Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }
    }

    #endregion
}