注册了这么久,终于想好写什么了。
最近因公司需要,做了从网络上抓取数据的功能。
在网上找资料,都没有找到完整的资料,特别是验证码登录这一块。
其实我这些代码都是从网上收集整理出来的,做得不是太好。
这里使用了 WebClient 和 HttpWebRequest 这两个 .NET 的类,分别做出一个静态类和一个动态类。
静态类 GetHtml 支持一般的网页取数据和 POST 提交,但不支持验证码;可以自动识别网页编码,也可以手动指定网页编码。不过最好手动指定,这样程序可以少运行一些检测代码。
// Usage example: build the form fields and POST them with the static GetHtml helper.
// `url` is the address of the page that receives the form.
System.Collections.Specialized.NameValueCollection PostVars = new System.Collections.Specialized.NameValueCollection();
PostVars.Add("uid", "name");
PostVars.Add("pwd", "123456");
// GetStrHtmlPost is a static member of GetHtml, so it has to be called through the class name.
string tmphtml = GetHtml.GetStrHtmlPost(url, PostVars);
动态类 PostWeb 支持验证码和用户身份验证;登录后会产生 Cookies 字符串,第二次运行程序时可以直接使用该 Cookies,而不用再次登录。
// Usage example: log in through a captcha-protected form with PostWeb.
PostWeb web=new PostWeb();
// Fetch the captcha image first; GetCode also records the cookies of that response in web.cookieHeader.
web.GetCode(验证码地址);
// Submit the login form: (submit URL, form data, referer).
string tmplogin=web.LoginPost("http://www.mystand.com.cn/login/submit.jsp","userid=hgj0000&password=06045369","http://www.mystand.com.cn/");
// The placeholder condition is whatever text marks a successful login in the returned page.
if(tmplogin.Contains(条件))
{
string cookie= web.cookieHeader;// save this to a file; assigning it back to the class next time skips the login
web.GetPage("http://www.mystand.com.cn/", "http://www.mystand.com.cn/");
}
// Usage example: reuse a previously saved cookie string instead of logging in again.
PostWeb web=new PostWeb();
web.cookieHeader=cookie;// assign the cookie string loaded from the saved file to the class
web.GetPage("http://www.mystand.com.cn/", "http://www.mystand.com.cn/");
using System;
using System.Collections.Generic;
using System.Text;
using System.Net;
using System.Text.RegularExpressions;
using System.IO;
using System.IO.Compression;
namespace Manager.Net.Html
{
/// <summary>
/// HTML相关
/// </summary>
public class CHtml
{
/// <summary>
/// Creates a new <see cref="CHtml"/> instance.
/// </summary>
/// <remarks>
/// No finalizer is declared: the class owns no unmanaged resources, and an empty
/// finalizer only makes the garbage collector track every instance for finalization.
/// </remarks>
public CHtml()
{
}
/// <summary>
/// Neutralizes characters and keywords that are commonly used in SQL injection attempts.
/// </summary>
/// <remarks>
/// This is a best-effort blacklist filter. It is not a substitute for parameterized
/// queries, which should be used whenever user input reaches a SQL statement.
/// </remarks>
/// <param name="source">The raw input string. May be null or empty.</param>
/// <returns>
/// The filtered string; <paramref name="source"/> itself when it is null or empty.
/// </returns>
public static string FilterSql(string source)
{
    if (String.IsNullOrEmpty(source))
    {
        return source;
    }

    // Double every single quote so it is read as a literal quote inside a SQL string.
    source = source.Replace("'", "''");
    // Swap ASCII punctuation for look-alike characters that have no meaning in SQL.
    // Escapes are used so the full-width characters survive re-encoding of this file.
    source = source.Replace("\"", "\u201C"); // double quote  -> left curly quote
    source = source.Replace("|", "\uFF5C");  // vertical bar  -> full-width bar
    source = source.Replace(";", "\uFF1B");  // semicolon     -> full-width semicolon (blocks statement chaining)
    source = source.Replace("(", "\uFF08");  // left paren    -> full-width paren
    source = source.Replace(")", "\uFF09");  // right paren   -> full-width paren

    // Remove the stored-procedure execution keywords regardless of letter case.
    // "execute" is listed first so it is removed whole instead of leaving "ute" behind,
    // and the loop repeats until nothing changes so that a removal cannot splice a new
    // keyword together (e.g. "exexecec").
    string previous;
    do
    {
        previous = source;
        source = Regex.Replace(source, "execute|exec", "", RegexOptions.IgnoreCase);
    }
    while (!String.Equals(previous, source, StringComparison.Ordinal));

    // Break the system / extended stored procedure prefixes (xp_, sp_) in any letter case.
    source = Regex.Replace(source, "([xs])(p_)", "$1 $2", RegexOptions.IgnoreCase);
    // Break hexadecimal literals (0x...) to hinder hex-encoded payloads.
    source = Regex.Replace(source, "(0)(x)", "$1 $2", RegexOptions.IgnoreCase);
    return source;
}
/// <summary>
/// Reads a stream to its end and returns the content as text.
/// </summary>
/// <param name="Stream">The stream to read.</param>
/// <param name="Encod">
/// The encoding used to decode the stream; when null, <see cref="Encoding.Default"/> is used.
/// </param>
/// <returns>The decoded text of the whole stream.</returns>
public static string HtmlStr(System.IO.Stream Stream, Encoding Encod)
{
    Encoding effective = Encod != null ? Encod : Encoding.Default;
    System.IO.StreamReader reader = new System.IO.StreamReader(Stream, effective);
    return reader.ReadToEnd();
}
/// <summary>
/// Checks whether a user-submitted URL parameter contains tokens that are typical of
/// SQL injection attempts.
/// </summary>
/// <param name="str">The parameter value to inspect. May be null or empty.</param>
/// <returns>
/// true when one of the suspicious tokens is present (in any letter case); otherwise false.
/// </returns>
public static bool VerifyString(string str)
{
    if (String.IsNullOrEmpty(str))
    {
        return false;
    }

    string[] suspiciousTokens = { "SELECT ", " AND ", " OR ", "EXEC ", "CHAR(" };
    foreach (string token in suspiciousTokens)
    {
        // An ordinal, case-insensitive search avoids the culture-dependent ToUpper() call.
        if (str.IndexOf(token, StringComparison.OrdinalIgnoreCase) >= 0)
        {
            return true;
        }
    }

    // The former trailing Replace() call was removed: its result was discarded, so it had no effect.
    return false;
}
/// <summary>
/// Collects the image addresses of every &lt;img&gt; tag found in a page.
/// </summary>
/// <param name="HtmlCode">The HTML text to scan. May be null or empty.</param>
/// <param name="imgHttp">The http:// prefix passed on to <c>GetImg</c> for each tag.</param>
/// <returns>
/// The result of <c>GetImg</c> for each tag, each followed by a '|' separator;
/// an empty string when the page is null, empty or contains no &lt;img&gt; tag.
/// </returns>
public static string GetImgSrc(string HtmlCode, string imgHttp)
{
    if (String.IsNullOrEmpty(HtmlCode))
    {
        return "";
    }

    // The page is matched case-insensitively instead of being lower-cased, because URL
    // paths are case-sensitive. Singleline lets a tag span several lines.
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Singleline;
    StringBuilder collected = new StringBuilder();
    foreach (Match tag in Regex.Matches(HtmlCode, @"<img.+?>", options))
    {
        collected.Append(GetImg(tag.Value.Trim(), imgHttp));
        collected.Append('|');
    }
    return collected.ToString();
}
/// <summary>
/// Extracts the image address from the src attribute of a single &lt;img&gt; tag.
/// </summary>
/// <remarks>
/// Only addresses ending in .bmp, .jpg, .gif or .png are recognized. The letter case of
/// the address is preserved.
/// </remarks>
/// <param name="ImgString">The text of one &lt;img ... /&gt; tag. May be null or empty.</param>
/// <param name="imgHttp">The http:// prefix that is put in front of relative addresses.</param>
/// <returns>
/// The address itself when it looks absolute; otherwise <paramref name="imgHttp"/> followed
/// by the address (which is empty when no image address was found).
/// </returns>
public static string GetImg(string ImgString, string imgHttp)
{
    string address = "";
    if (!String.IsNullOrEmpty(ImgString))
    {
        // src = "value", src = 'value' or src = value; the value must end in a known image extension.
        Match src = Regex.Match(
            ImgString,
            @"src\s*=\s*[""']?(?<url>[^""'\s>]+\.(?:bmp|jpg|gif|png))",
            RegexOptions.IgnoreCase);
        if (src.Success)
        {
            address = src.Groups["url"].Value;
        }
    }

    // An explicit scheme (or a protocol-relative "//host/...") is always absolute.
    if (address.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
        address.StartsWith("https://", StringComparison.OrdinalIgnoreCase) ||
        address.StartsWith("//", StringComparison.Ordinal))
    {
        return address;
    }

    // Legacy heuristic kept for compatibility: an address without a scheme that contains
    // one of these domain suffixes is treated as already having a host.
    string[] domainHints = { ".net", ".com", ".org", ".cn", ".cc", ".info", ".biz", ".tv" };
    foreach (string hint in domainHints)
    {
        if (address.IndexOf(hint, StringComparison.OrdinalIgnoreCase) != -1)
        {
            return address;
        }
    }

    return imgHttp + address;
}
/// <summary>
/// Collects the values of all href attributes found in a page.
/// </summary>
/// <param name="HtmlCode">The HTML text to scan. May be null or empty.</param>
/// <returns>
/// Every href value in document order, each followed by a '|' separator, with the
/// original letter case preserved; an empty string when nothing was found.
/// </returns>
public static string GetHref(string HtmlCode)
{
    if (String.IsNullOrEmpty(HtmlCode))
    {
        return "";
    }

    // Three accepted forms: href="value", href='value' and unquoted href=value.
    // The named group captures only the value, so no quote or trailing markup leaks into the result.
    const string pattern = @"href\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s>]+))";
    StringBuilder links = new StringBuilder();
    foreach (Match link in Regex.Matches(HtmlCode, pattern, RegexOptions.IgnoreCase))
    {
        links.Append(link.Groups["url"].Value.Trim());
        links.Append('|');
    }
    return links.ToString();
}
/// <summary>
/// Strips HTML tags from a page.
/// </summary>
/// <param name="strhtml">The HTML text.</param>
/// <returns>The text with every &lt;...&gt; tag removed and surrounding whitespace trimmed.</returns>
public static string RemoveHTML(string strhtml)
{
    return Regex.Replace(strhtml, @"<[^>]+>|</[^>]+>", "").Trim();
}
/// <summary>
/// Detects the text encoding declared by a page through a <c>charset=</c> declaration.
/// </summary>
/// <param name="strHtml">The HTML text to inspect. May be null or empty.</param>
/// <returns>
/// The declared encoding; <see cref="Encoding.UTF8"/> when no charset is declared;
/// <see cref="Encoding.Default"/> when the declared charset name is not known to the runtime.
/// </returns>
public static Encoding GetEncoding(string strHtml)
{
    if (String.IsNullOrEmpty(strHtml))
    {
        return Encoding.UTF8;
    }

    // Covers both content="text/html; charset=gb2312" and <meta charset="utf-8">:
    // an optional quote is allowed between '=' and the charset name.
    Match declaration = Regex.Match(
        strHtml,
        @"\bcharset\s*=\s*[""']?(?<charset>[-a-zA-Z_0-9]+)",
        RegexOptions.IgnoreCase);
    if (!declaration.Success)
    {
        return Encoding.UTF8;
    }

    try
    {
        return Encoding.GetEncoding(declaration.Groups["charset"].Value);
    }
    catch (ArgumentException)
    {
        // The name is not a valid or supported code page name.
        return Encoding.Default;
    }
    catch (NotSupportedException)
    {
        // The code page exists but is not available on this platform.
        return Encoding.Default;
    }
}
/// <summary>
/// Reads the installed Internet Explorer version from the registry.
/// </summary>
/// <returns>
/// The "Version" value stored under HKLM\Software\Microsoft\Internet Explorer, or an
/// empty string when the key or the value does not exist.
/// </returns>
public static string GetIEVersion()
{
    // Registry paths are separated by backslashes; a path written with '/' is never found.
    using (Microsoft.Win32.RegistryKey versionKey = Microsoft.Win32.Registry.LocalMachine.OpenSubKey(@"Software\Microsoft\Internet Explorer"))
    {
        if (versionKey == null)
        {
            return "";
        }

        object version = versionKey.GetValue("Version");
        return version == null ? "" : version.ToString();
    }
}
}
/// <summary>
/// 模拟网页提交数据
/// </summary>
public class PostWeb
{
/// <summary>
/// Cookie header string of the current session. GetCode, LoginGet and LoginPost overwrite it
/// after each response; LoginGet, LoginPost and GetPage send it with their requests.
/// </summary>
public string cookieHeader = "";
/// <summary>
/// Name of the page encoding. When non-empty, SetEncod uses it instead of the character set
/// reported by the response.
/// </summary>
public string Encod = "";
// NOTE(review): not read by any method visible in this class - confirm whether it is still needed.
public bool SetCookies = false;
// NOTE(review): not read by any method visible in this class; each request method sets its own HTTP verb.
public string Method = "POST";
/// <summary>
/// Whether a proxy should be used (checked by ProxySetting).
/// </summary>
public bool IsProxy = false;
/// <summary>
/// Proxy address (read by ProxySetting).
/// </summary>
public string proxyaddress = "";
/// <summary>
/// User name for credential-based authentication.
/// </summary>
public string CredentialUserName = "";
/// <summary>
/// Password for credential-based authentication.
/// </summary>
public string CredentialPassword = "";
/// <summary>
/// Domain for credential-based authentication.
/// </summary>
public string CredentialDoMain = "";
// Encoding chosen by SetEncod on its first call and reused for every later response.
Encoding tmpEncod;
/// <summary>
/// Creates a new <see cref="PostWeb"/> instance with default settings.
/// </summary>
/// <remarks>
/// No finalizer is declared: the class owns no unmanaged resources, and an empty
/// finalizer only makes the garbage collector track every instance for finalization.
/// </remarks>
public PostWeb()
{
}
/// <summary>
/// Applies the proxy configuration to a request when <c>IsProxy</c> is set.
/// </summary>
/// <remarks>
/// The system proxy is preferred. When the system configuration does not route the
/// request through a proxy, the address in <c>proxyaddress</c> is used if one is set.
/// If neither applies, the request is left untouched.
/// </remarks>
/// <param name="request">The request to configure.</param>
private void ProxySetting(HttpWebRequest request)
{
    if (!IsProxy || request == null)
    {
        return;
    }

    // WebProxy.GetDefaultProxy() is obsolete; the system proxy is obtained through WebRequest instead.
    IWebProxy systemProxy = WebRequest.GetSystemWebProxy();
    bool systemProxyApplies = systemProxy != null && !systemProxy.IsBypassed(request.RequestUri);

    // The proxy has to be assigned to the request - building it alone has no effect.
    if (systemProxyApplies)
    {
        request.Proxy = systemProxy;
    }
    else if (!String.IsNullOrEmpty(proxyaddress))
    {
        request.Proxy = new WebProxy(new Uri(proxyaddress));
    }
}
/// <summary>
/// Attaches network credentials to a request when user name, password and domain are all set.
/// </summary>
/// <param name="request">The request to configure.</param>
private void NetworkCredentialSetting(HttpWebRequest request)
{
    // The field is named CredentialPassword; the former "CredentialPassWord" spelling did not compile.
    bool hasCredentials = !String.IsNullOrEmpty(CredentialUserName)
        && !String.IsNullOrEmpty(CredentialPassword)
        && !String.IsNullOrEmpty(CredentialDoMain);
    if (!hasCredentials)
    {
        return;
    }

    request.PreAuthenticate = true;
    request.Credentials = new NetworkCredential(CredentialUserName, CredentialPassword, CredentialDoMain);
    request.SendChunked = false;
}
/// <summary>
/// Downloads the captcha image and saves it to a file.
/// </summary>
/// <param name="strURL">Address of the captcha image.</param>
/// <param name="dir">Target directory; created when missing. When null or empty, only <paramref name="filename"/> is used as the path.</param>
/// <param name="filename">Name of the file to write.</param>
/// <param name="imageFormat">Image format used when saving.</param>
public void GetCode(string strURL, string dir, string filename, System.Drawing.Imaging.ImageFormat imageFormat)
{
    string target = filename;
    if (!String.IsNullOrEmpty(dir))
    {
        System.IO.Directory.CreateDirectory(dir);
        // Path.Combine inserts the platform's separator instead of a hard-coded one.
        target = System.IO.Path.Combine(dir, filename);
    }

    // using guarantees the image is released even when Save throws.
    using (System.Drawing.Image code = GetCode(strURL))
    {
        code.Save(target, imageFormat);
    }
}
/// <summary>
/// Downloads the captcha image and records the cookies of the response in <c>cookieHeader</c>.
/// </summary>
/// <remarks>
/// The request is sent with a fresh, empty cookie container, so the previous value of
/// <c>cookieHeader</c> is replaced by the cookies of this response.
/// </remarks>
/// <param name="strURL">Address of the captcha image.</param>
/// <returns>The downloaded image. The caller is responsible for disposing it.</returns>
public System.Drawing.Image GetCode(string strURL)
{
    Uri address = new Uri(strURL);
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(address);
    ProxySetting(request);
    NetworkCredentialSetting(request);
    request.Method = "GET";
    request.KeepAlive = true;
    request.CookieContainer = new CookieContainer();

    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        cookieHeader = request.CookieContainer.GetCookieHeader(address);

        // Image.FromStream requires its stream to stay open for the lifetime of the image,
        // but the response stream is closed when this method returns. The bytes are therefore
        // copied into a MemoryStream that is deliberately left undisposed and lives as long
        // as the returned image.
        MemoryStream buffer = new MemoryStream();
        using (Stream body = response.GetResponseStream())
        {
            byte[] chunk = new byte[8192];
            int read;
            while ((read = body.Read(chunk, 0, chunk.Length)) > 0)
            {
                buffer.Write(chunk, 0, read);
            }
        }
        buffer.Position = 0;
        return System.Drawing.Image.FromStream(buffer, false, false);
    }
}
/// <summary>
/// Requests a login page with GET, sending the current cookies and recording the cookies
/// that apply after the response in <c>cookieHeader</c>.
/// </summary>
/// <param name="strURL">Address of the page to request.</param>
/// <param name="strReferer">Value of the Referer header.</param>
/// <returns>The response body after URL-decoding and HTML-decoding.</returns>
public string LoginGet(string strURL, string strReferer)
{
    Uri address = new Uri(strURL);
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(address);
    ProxySetting(request);
    NetworkCredentialSetting(request);
    request.AllowAutoRedirect = true;
    request.KeepAlive = true;
    request.Accept = " image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-Flash, application/vnd.ms-Excel, application/vnd.ms-PowerPoint, application/msword, application/xaml+XML, application/vnd.ms-xpsdocument, application/x-ms-xbap, application/x-ms-application, application/QVOD, */*";
    request.Referer = strReferer;
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; MAXTHON 2.0)";
    request.ContentType = "application/x-www-form-urlencoded";
    request.Method = "GET";
    request.Timeout = 3000;

    // The container is always attached: without it, reading the cookies back after the
    // response would dereference null whenever no cookie had been stored yet.
    CookieContainer cookies = new CookieContainer();
    if (!String.IsNullOrEmpty(cookieHeader))
    {
        // GetCookieHeader separates cookies with ';' while SetCookies expects ',' between
        // cookies; without the conversion only the first cookie would be restored.
        cookies.SetCookies(address, cookieHeader.Replace(';', ','));
    }
    request.CookieContainer = cookies;

    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        cookieHeader = cookies.GetCookieHeader(address);
        SetEncod(response.CharacterSet);
        string body = CHtml.HtmlStr(response.GetResponseStream(), tmpEncod);
        return System.Web.HttpUtility.HtmlDecode(System.Web.HttpUtility.UrlDecode(body));
    }
}
/// <summary>
/// Chooses the encoding used to decode responses, once per instance.
/// </summary>
/// <remarks>
/// The configured <c>Encod</c> name wins over the character set reported by the server.
/// When the server reports no character set, or one the runtime does not know, nothing is
/// stored and the next response gets another chance.
/// </remarks>
/// <param name="cod">Character set name reported by the response; may be null or empty.</param>
void SetEncod(string cod)
{
    if (tmpEncod != null)
    {
        return;
    }

    if (!String.IsNullOrEmpty(Encod))
    {
        // A wrong name configured by the caller still throws, so the mistake is not hidden.
        tmpEncod = Encoding.GetEncoding(Encod);
        return;
    }

    if (String.IsNullOrEmpty(cod))
    {
        return;
    }

    try
    {
        // Some servers send the charset wrapped in quotes.
        tmpEncod = Encoding.GetEncoding(cod.Trim().Trim('"', '\''));
    }
    catch (ArgumentException)
    {
        // Unknown charset name from the server: leave the encoding unset.
    }
}
/// <summary>
/// Submits login data with POST, sending the current cookies and recording the cookies
/// that apply after the response in <c>cookieHeader</c>.
/// Example: LoginPost("http://www.mystand.com.cn/login/submit.jsp","userid=hgj0000&amp;password=06045369","http://www.mystand.com.cn/");
/// </summary>
/// <param name="strURL">Address the login data is submitted to.</param>
/// <param name="strArgs">
/// The form data in application/x-www-form-urlencoded form. It is written as ASCII, so
/// non-ASCII values must already be percent-encoded.
/// </param>
/// <param name="strReferer">Value of the Referer header.</param>
/// <returns>The response body after URL-decoding and HTML-decoding.</returns>
public string LoginPost(string strURL, string strArgs, string strReferer)
{
    Uri address = new Uri(strURL);
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(address);
    ProxySetting(request);
    NetworkCredentialSetting(request);
    request.AllowAutoRedirect = true;
    request.KeepAlive = true;
    request.Accept = " image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-PowerPoint, application/msword, application/xaml+xml, application/vnd.ms-xpsdocument, application/x-ms-xbap, application/x-ms-application, application/QVOD, */*";
    request.Referer = strReferer;
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; MAXTHON 2.0)";
    request.ContentType = "application/x-www-form-urlencoded";
    request.Method = "POST";
    request.Timeout = 3000;

    // The container is always attached: without it, reading the cookies back after the
    // response would dereference null whenever no cookie had been stored yet.
    CookieContainer cookies = new CookieContainer();
    if (!String.IsNullOrEmpty(cookieHeader))
    {
        // GetCookieHeader separates cookies with ';' while SetCookies expects ',' between
        // cookies; without the conversion only the first cookie would be restored.
        cookies.SetCookies(address, cookieHeader.Replace(';', ','));
    }
    request.CookieContainer = cookies;

    // Write the form data into the request body; using closes both objects even on failure.
    using (Stream requestStream = request.GetRequestStream())
    using (StreamWriter writer = new StreamWriter(requestStream, Encoding.ASCII))
    {
        writer.Write(strArgs);
    }

    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        cookieHeader = cookies.GetCookieHeader(address);
        SetEncod(response.CharacterSet);
        string body = CHtml.HtmlStr(response.GetResponseStream(), tmpEncod);
        return System.Web.HttpUtility.HtmlDecode(System.Web.HttpUtility.UrlDecode(body));
    }
}
/// <summary>
/// Requests a page with GET, sending the cookies stored in <c>cookieHeader</c>
/// (for example those recorded by a previous successful login).
/// </summary>
/// <param name="strURL">Address of the page to fetch.</param>
/// <param name="strReferer">Value of the Referer header; skipped when null or empty.</param>
/// <returns>The response body after URL-decoding and HTML-decoding.</returns>
public string GetPage(string strURL, string strReferer)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strURL);
    ProxySetting(request);
    NetworkCredentialSetting(request);
    request.ContentType = "text/html";
    request.Method = "GET";
    if (!string.IsNullOrEmpty(strReferer))
    {
        request.Referer = strReferer;
    }
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; MAXTHON 2.0)";

    // Send the stored cookies as a raw header; an empty Cookie header is not sent.
    if (!String.IsNullOrEmpty(cookieHeader))
    {
        request.Headers.Add("Cookie", cookieHeader);
    }

    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        SetEncod(response.CharacterSet);
        string body = CHtml.HtmlStr(response.GetResponseStream(), tmpEncod);
        return System.Web.HttpUtility.HtmlDecode(System.Web.HttpUtility.UrlDecode(body));
    }
}
}
/// <summary>
/// 取网页数据
/// </summary>
public class GetHtml
{
/// <summary>
/// Creates a new <see cref="GetHtml"/> instance. All download helpers of the class are
/// static, so an instance is not required to use them.
/// </summary>
/// <remarks>
/// No finalizer is declared: the class owns no unmanaged resources, and an empty
/// finalizer only makes the garbage collector track every instance for finalization.
/// </remarks>
public GetHtml()
{
}
/// <summary>
/// Submits form values with POST and returns the response as text.
/// Example:
/// System.Collections.Specialized.NameValueCollection PostVars = new System.Collections.Specialized.NameValueCollection();
/// PostVars.Add("uid","name");
/// PostVars.Add("pwd","123456");
/// GetStrHtmlPost(url,PostVars);
/// </summary>
/// <param name="url">Address the form is submitted to.</param>
/// <param name="PostVars">The form fields to submit.</param>
/// <returns>
/// The response decoded with the encoding the page declares; an empty string when
/// <paramref name="url"/> or <paramref name="PostVars"/> is missing or the request fails.
/// </returns>
public static string GetStrHtmlPost(String url, System.Collections.Specialized.NameValueCollection PostVars)
{
    if (PostVars == null || String.IsNullOrEmpty(url))
    {
        return "";
    }

    try
    {
        byte[] buf;
        using (System.Net.WebClient client = new System.Net.WebClient())
        {
            buf = client.UploadValues(url, "POST", PostVars);
        }

        // The charset declaration is plain ASCII, so an ASCII decode is enough to find it.
        // The final text is always decoded from the raw bytes with the detected encoding,
        // so a UTF-8 page is no longer returned as text decoded with the system code page.
        Encoding encoding = CHtml.GetEncoding(Encoding.ASCII.GetString(buf));
        return encoding.GetString(buf);
    }
    catch (System.Net.WebException)
    {
        // Best effort by design: network and HTTP failures yield an empty result.
        return "";
    }
}
/// <summary>
/// Downloads a page and returns its HTML, leaving the choice of encoding to the
/// two-argument overload.
/// </summary>
/// <param name="url">Address of the page.</param>
/// <returns>The page content as text.</returns>
public static string GetStrHtml(string url)
{
    // No encoding supplied: the overload is called with null.
    Encoding notSpecified = null;
    return GetStrHtml(url, notSpecified);
}
/// <summary>
/// Downloads a page and returns its HTML.
/// </summary>
/// <param name="url">Address of the page.</param>
/// <param name="encoding">
/// Encoding used to decode the page; when null, the encoding is detected from the page's
/// charset declaration.
/// </param>
/// <returns>The page content as text.</returns>
public static string GetStrHtml(string url, Encoding encoding)
{
    byte[] buf;
    // WebClient is disposable; using releases it as soon as the download is done.
    using (WebClient client = new WebClient())
    {
        buf = client.DownloadData(url);
    }

    if (encoding != null)
    {
        return encoding.GetString(buf);
    }

    // Decode as UTF-8 first, then look for the charset the page declares.
    string html = Encoding.UTF8.GetString(buf);
    Encoding detected = CHtml.GetEncoding(html);

    // Compare by code page rather than by reference, so any UTF-8 encoding instance counts.
    if (detected.CodePage == Encoding.UTF8.CodePage)
    {
        return html;
    }
    return detected.GetString(buf);
}
}
}
注册了这么久,终于想好写什么了。
最新推荐文章于 2024-07-08 17:28:27 发布