C#正则表达式 解析html+table tr td 内容

aspx页面获取方法:
    var tbZHXX = GetWorldexWyHtml(s, @"id=""tbZHXX""", @"class=""GridCommonItem""", "Worldex"); 

提交参数及隐藏域(hidden)的值:
 var postUrl = "http://xxm.cn/glj/querydata/xxSearchOld.aspx";

            // Seed the form with the page's hidden fields, keeping only those that have a value.
            List<KeyValuePair<String, String>> paramList = GetViewHiddenData(postUrl)
                .Where(hidden => !string.IsNullOrEmpty(hidden.Value))
                .ToList();

            // Append the visible form fields: the bill number and the search button.
            paramList.Add(new KeyValuePair<string, string>("txtBillNo", strBlNo));
            paramList.Add(new KeyValuePair<string, string>("btnSearch", "查询"));
 var s = HttpAspxPostMathHtml(postUrl, paramList);

/// <summary>
		/// Extracts the cell texts of every matching table row from an HTML string.
		/// </summary>
		/// <param name="regexInfo">HTML to parse.</param>
		/// <param name="regexParm">Regex fragment that must occur inside the opening &lt;table&gt; tag (for example its class or id attribute). It is spliced into the pattern unescaped.</param>
		/// <param name="classParm">Regex fragment that must occur inside the opening &lt;tr&gt; tag (for example its class or id attribute). It is spliced into the pattern unescaped.</param>
		/// <param name="companyCode">Selects the cell pattern: "Worldex" uses the line-break-delimited &lt;td&gt; pattern, any other value reads the text inside &lt;font&gt; elements.</param>
		/// <returns>One <see cref="WWyDetails"/> per matched row, in document order; empty when nothing matches or when <paramref name="regexInfo"/> is null or empty.</returns>
		public static List<WWyDetails> GetWWyHtml(string regexInfo, string regexParm, string classParm, string companyCode)
		{
			List<WWyDetails> rows = new List<WWyDetails>();
			if (string.IsNullOrEmpty(regexInfo))
			{
				return rows;
			}

			// Matches each <tr> (filtered by classParm) that sits inside a <table> filtered by regexParm;
			// the look-behind refuses to cross another <table> or </table> tag.
			string rowPattern = "(?is)(?<=<table[^>]*?" + regexParm + "[^>]*?>(?:(?!</?table).)*)(?is)<tr[^>]*?" + classParm + "[^>]*?>(?:\\s*<td[^>]*>(.*?)</td>)*\\s*</tr>";

			bool isWorldex = companyCode == "Worldex";

			// Worldex: the cell text sits on its own line, i.e. it is wrapped in "\r\n" plus whitespace
			// on both sides, and the <td> may carry a scope attribute.
			// Others: the text is wrapped in <font ...>, e.g. <font color="#333333">SNL7QDJL510757</font>.
			string cellPattern = isWorldex
				? @"(?<=<td(\s+scope=[^>]+)?>)\r\n\s+.*?\r\n\s+(?=</td>)"
				: @"(?<=<font(\s+color=[^>]+)?>).*?(?=</font>)";

			foreach (Match row in Regex.Matches(regexInfo, rowPattern))
			{
				List<string> cells = new List<string>();
				foreach (Match cell in Regex.Matches(row.Value, cellPattern))
				{
					// Only the Worldex pattern captures the surrounding line breaks, so only it is trimmed.
					cells.Add(isWorldex ? cell.Value.Trim() : cell.Value);
				}
				rows.Add(new WWyDetails() { WSinotrans = cells });
			}
			return rows;
		}

		/// <summary>
		/// One table row as returned by GetWWyHtml.
		/// </summary>
		public class WWyDetails
		{
			/// <summary>
			/// Creates a row whose cell list is empty rather than null, so it can be enumerated safely.
			/// </summary>
			public WWyDetails()
			{
				WSinotrans = new List<string>();
			}

			/// <summary>Cell texts of the row, in the order they were matched.</summary>
			public List<string> WSinotrans { get; set; }
		}



获取asp页面及解析使用方法:   
// Form body of the container query; strBlNo is the bill number being looked up.
var postData = "search=true&companyname=&companycode=&container_no=&bill_no=" + strBlNo + "&btn3.x=39&btn3.y=15";
var sPuci = HttpAspPostMathHtml("http://xxx/index_dt_container.asp", postData, "gb2312");

// Pick the tableGrid table whose header row mentions "铅封号" and read its data rows.
var gdvContainer = GetSYDetails(sPuci, @"class=tableGrid", @"class=gridHeader", "铅封号", "YTWY");

		/// <summary>
		/// Finds the first table whose header row contains a given keyword and returns the cell
		/// texts of its data rows.
		/// </summary>
		/// <param name="regexInfo">HTML to parse.</param>
		/// <param name="classTable">Regex fragment that must occur inside the opening &lt;table&gt; tag (for example its class or id attribute). It is spliced into the patterns unescaped.</param>
		/// <param name="classParm">Attribute text of the header row; the header is matched as "&lt;tr " + classParm + "&gt;". It is spliced into the pattern unescaped.</param>
		/// <param name="compareInfo">Keyword that the header row must contain for the table to be selected.</param>
		/// <param name="companyCode">Selects the cell pattern: "YTWY" expects an optional align attribute on &lt;td&gt;, any other value an optional class attribute.</param>
		/// <returns>One <see cref="SYDetails"/> per row after the first &lt;tr&gt; of the selected table; empty when no table qualifies or when <paramref name="regexInfo"/> is null or empty.</returns>
		public static List<SYDetails> GetSYDetails(string regexInfo, string classTable, string classParm, string compareInfo, string companyCode)
		{
			List<SYDetails> result = new List<SYDetails>();
			if (string.IsNullOrEmpty(regexInfo))
			{
				return result;
			}

			// Every <table ...classTable...> ... </table> span is a candidate.
			Regex tablePattern = new Regex(@"<table.*?" + classTable + "[^>]*?>[\\s\\S]*?<\\/table>");
			bool isYtwy = companyCode == "YTWY";

			foreach (Match table in tablePattern.Matches(regexInfo))
			{
				// Grab the header row and check whether it mentions the keyword.
				string header = Regex.Match(table.Value, "(?is)<tr " + classParm + ">(?:\\s*<td[^>]*>(.*?)</td>)*\\s*((?!</tr>).)*").Value.Trim();
				if (!header.Contains(compareInfo))
				{
					continue;
				}

				// Split the selected table into its <tr> rows.
				Regex rowPattern = new Regex(@"(?is)(?<=<table[^>]*?" + classTable + "[^>]*?>(?:(?!</?table).)*)(?is)<tr[^>]*?>(?:\\s*<td[^>]*>(.*?)</td>)*\\s*</tr>");

				// Skip(1): the first <tr> is the header row.
				foreach (Match row in rowPattern.Matches(table.Value).Cast<Match>().Skip(1))
				{
					// A fresh list per row: a shared list would leak the previous row's cells into a
					// YTWY "no data" row, which is added with an empty cell list instead.
					List<string> cells = new List<string>();

					if (isYtwy)
					{
						if (!row.Value.Contains("查询数据为空。"))
						{
							cells = Regex.Matches(row.Value, @"(?is)(?<=<td(\s+align=[^>]+)?>).*?(?=\s*</td)")
									.Cast<Match>().Select(cell => cell.Value).ToList();
						}
					}
					else
					{
						cells = Regex.Matches(row.Value, @"(?is)(?<=<td(\s+class=[^>]+)?>).*?(?=\s*</td)")
								.Cast<Match>().Select(cell => cell.Value).ToList();
					}

					result.Add(new SYDetails() { SYTwy = cells });
				}

				// Only the first qualifying table is processed.
				break;
			}
			return result;
		}
		/// <summary>
		/// One table row as returned by GetSYDetails.
		/// </summary>
		public class SYDetails
		{
			/// <summary>
			/// Creates a row whose cell list is empty rather than null, so it can be enumerated safely.
			/// </summary>
			public SYDetails()
			{
				SYTwy = new List<string>();
			}

			/// <summary>Cell texts of the row, in the order they were matched.</summary>
			public List<string> SYTwy { get; set; }
		}


#region ASP/ASPX页面  Get/Post获取返回数据
		/// <summary>
		/// POSTs a form body to a classic ASP page and returns the cleaned &lt;body&gt; element of the reply.
		/// </summary>
		/// <param name="Url">Address to post to.</param>
		/// <param name="postDataStr">Form body, already in "a=1&amp;b=2" form; it is sent as given, without URL-encoding.</param>
		/// <param name="encoding">Name of the encoding used to decode the response (for example "gb2312").</param>
		/// <returns>
		/// The first &lt;body&gt;...&lt;/body&gt; element (tags included, trimmed) of the response after it has
		/// been passed through DelHTML; when the response has no &lt;body&gt; element, the whole cleaned response.
		/// </returns>
		public static string HttpAspPostMathHtml(string Url, string postDataStr, string encoding)
		{
			HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
			request.Method = "POST";
			request.Accept = "text/html, application/xhtml+xml, image/jxr, */*";
			request.ContentType = "application/x-www-form-urlencoded";
			request.KeepAlive = true;
			request.Headers.Add("Accept-Language", "zh-Hans-CN,zh-Hans;q=0.7,ja;q=0.3");
			// Let the framework both advertise gzip/deflate and decode it. Adding the Accept-Encoding
			// header by hand without this leaves a compressed reply undecoded in the StreamReader.
			request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
			request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko";

			// NOTE(review): the body is encoded with the system default encoding, not with `encoding`;
			// confirm this is intended when postDataStr can contain non-ASCII characters.
			byte[] bytes = System.Text.Encoding.Default.GetBytes(postDataStr);
			request.ContentLength = bytes.Length;
			using (Stream stream = request.GetRequestStream())
			{
				stream.Write(bytes, 0, bytes.Length);
			}

			string retString;
			using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
			using (Stream responsestream = response.GetResponseStream())
			using (StreamReader sr = new StreamReader(responsestream, System.Text.Encoding.GetEncoding(encoding)))
			{
				retString = sr.ReadToEnd();
			}

			// Keep only the <body> element; fall back to the whole cleaned document when there is none
			// instead of failing on an empty match list.
			string cleaned = DelHTML(retString);
			Match body = Regex.Match(cleaned, @"(?is)<body[^>]*?>([\s\S].*?)</body>");
			return body.Success ? body.Value.Trim() : cleaned;
		}

		/// <summary>
		/// Issues a GET request to a classic ASP page and returns the raw response text.
		/// </summary>
		/// <param name="Url">Address of the page, without a query string.</param>
		/// <param name="postDataStr">Query string (without the leading "?"); it is appended as given, without URL-encoding.</param>
		/// <param name="encoding">Name of the encoding used to decode the response (for example "gb2312").</param>
		/// <returns>The complete response text; unlike the POST helpers it is not passed through DelHTML.</returns>
		public static string HttpAspGetMathHtml(string Url, string postDataStr, string encoding)
		{
			HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url + "?" + postDataStr);
			request.Method = "Get";
			request.Accept = "text/html, application/xhtml+xml, image/jxr, */*";
			request.KeepAlive = true;
			request.Headers.Add("Accept-Language", "zh-Hans-CN,zh-Hans;q=0.7,ja;q=0.3");
			// Let the framework both advertise gzip/deflate and decode it. Adding the Accept-Encoding
			// header by hand without this leaves a compressed reply undecoded in the StreamReader.
			request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
			request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko";

			// Dispose the response as well as the stream and reader so the connection is released.
			using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
			using (Stream responsestream = response.GetResponseStream())
			using (StreamReader sr = new StreamReader(responsestream, System.Text.Encoding.GetEncoding(encoding)))
			{
				return sr.ReadToEnd();
			}
		}

		/// <summary>
		/// Downloads an ASPX page and collects its hidden input fields (such as the view state) so
		/// they can be posted back with a form submission.
		/// </summary>
		/// <param name="Url">Address of the page to read.</param>
		/// <returns>
		/// Map from each hidden input's id attribute to its value attribute. Inputs without a matched
		/// id are skipped; when an id occurs more than once the last occurrence wins.
		/// </returns>
		public Dictionary<string, string> GetViewHiddenData(string Url)
		{
			string html;
			// NOTE(review): .Result blocks the calling thread; it is kept because the method is synchronous.
			using (HttpClient httpClient = new HttpClient())
			{
				httpClient.MaxResponseContentBufferSize = 256000;
				httpClient.DefaultRequestHeaders.Add("user-agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36");
				using (HttpResponseMessage response = httpClient.GetAsync(new Uri(Url)).Result)
				{
					html = response.Content.ReadAsStringAsync().Result;
				}
			}

			Dictionary<string, string> returnHidden = new Dictionary<string, string>();
			foreach (Match input in Regex.Matches(html, @"<input type=""hidden""[^>]*?.*?\/>"))
			{
				string item = input.Value.Trim();

				// Read the id and value attributes of the hidden input.
				string key = Regex.Match(item, @"<input type=""hidden""[^>]*?[^>]+?id=""([\s\S]+?)""[^>]+>").Groups[1].Value;
				if (key.Length == 0)
				{
					// No id matched: there is nothing to post this value under.
					continue;
				}
				string value = Regex.Match(item, @"<input type=""hidden""[^>]*?[^>]+?value=""([\s\S]+?)""[^>]+>").Groups[1].Value;

				// Indexer instead of Add: a repeated id must not abort the whole extraction.
				returnHidden[key] = value;
			}
			return returnHidden;
		}

		/// <summary>
		/// POSTs form fields to an ASPX page and returns the cleaned &lt;body&gt; element of the reply.
		/// </summary>
		/// <param name="Url">Address of the page; it is first requested with GET and then posted to.</param>
		/// <param name="postDataStr">Form fields to submit; they are URL-encoded by FormUrlEncodedContent.</param>
		/// <returns>
		/// The first &lt;body&gt;...&lt;/body&gt; element (tags included, trimmed) of the response after it has
		/// been passed through DelHTML; when the response has no &lt;body&gt; element, the whole cleaned response.
		/// </returns>
		public static string HttpAspxPostMathHtml(string Url, List<KeyValuePair<String, String>> postDataStr)
		{
			string retString;
			// NOTE(review): .Result blocks the calling thread; it is kept because the method is synchronous.
			using (HttpClient httpClient = new HttpClient())
			{
				// The reply to this GET is not used. Presumably it is issued so the same client holds
				// the session cookies before posting - confirm against the target site.
				using (httpClient.GetAsync(new Uri(Url)).Result)
				{
				}

				using (HttpResponseMessage response = httpClient.PostAsync(new Uri(Url), new FormUrlEncodedContent(postDataStr)).Result)
				{
					retString = response.Content.ReadAsStringAsync().Result;
				}
			}

			// Keep only the <body> element; fall back to the whole cleaned document when there is none
			// instead of failing on an empty match list.
			string cleaned = DelHTML(retString);
			Match body = Regex.Match(cleaned, @"(?is)<body[^>]*?>([\s\S].*?)</body>");
			return body.Success ? body.Value.Trim() : cleaned;
		}

		#region	将HTML去除一些无用数据
		/// <summary>
		/// Strips noise from an HTML string: single-line script elements, hidden inputs and comment
		/// markers are removed, and a handful of character entities are decoded.
		/// </summary>
		/// <param name="Htmlstring">HTML to clean.</param>
		/// <returns>
		/// The cleaned HTML, or an empty string when <paramref name="Htmlstring"/> is null or empty.
		/// Line breaks are left in place.
		/// </returns>
		public static string DelHTML(string Htmlstring)
		{
			if (string.IsNullOrEmpty(Htmlstring))
			{
				return string.Empty;
			}

			// { pattern, replacement } pairs, applied strictly in this order (all case-insensitive).
			// - No pattern uses Singleline, so "." stops at a line feed: a <script> element or hidden
			//   input that spans several lines is left alone, and "<!--.*" deletes from the comment
			//   opener to the end of that line only.
			// - "&amp;" is decoded before "&lt;" / "&gt;", so "&amp;lt;" ends up as "<".
			// - The final rule drops every numeric entity that was not decoded by an earlier rule.
			string[][] rules =
			{
				new[] { @"<script[^>]*?>.*?</script>", "" },
				new[] { @"<input type=""hidden""[^>]*?.*?\/>", "" },
				new[] { @"-->", "" },
				new[] { @"<!--.*", "" },
				new[] { @"&(quot|#34);", "\"" },
				new[] { @"&(amp|#38);", "&" },
				new[] { @"&(lt|#60);", "<" },
				new[] { @"&(gt|#62);", ">" },
				new[] { @"&(iexcl|#161);", "\u00A1" },
				new[] { @"&(cent|#162);", "\u00A2" },
				new[] { @"&(pound|#163);", "\u00A3" },
				new[] { @"&(copy|#169);", "\u00A9" },
				new[] { @"&#(\d+);", "" },
			};

			foreach (string[] rule in rules)
			{
				Htmlstring = Regex.Replace(Htmlstring, rule[0], rule[1], RegexOptions.IgnoreCase);
			}

			// The previous trailing `Htmlstring.Replace("\r\n", "")` discarded its result and therefore
			// never removed anything. It is dropped rather than "fixed": the Worldex cell pattern in
			// GetWWyHtml matches on "\r\n", so line breaks must survive this method.
			return Htmlstring;
		}
		#endregion
		#endregion

展开阅读全文

没有更多推荐了,返回首页