之前做了一个通过IP来扫描网站的工具,以为能扫到所有的网站,因为IP是可以全部遍历的,但是忘记了很多网站是禁止反向解析的。
所以更改了写法,仿照类似爬虫的方法,抓取一个网页的内容,然后遍历其中的网址,然后根据其中的网址再去遍历内容,如此做了个简单的网页标题信息采集工具.
--更新,1.使用多线程处理;2.使用缓存技术减少DB访问;3.优化网页编码获取;4.增加错误日志记录及错误网址检测
主要代码如下:
using System;
using System.Threading;
using System.Collections.Generic;
using System.Windows.Forms;
using System.Linq;
using System.Net;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using System.Data;
using System.Data.OracleClient;
using System.Collections;
namespace netCollect
{
class Program
{
static ulong count = 0;
static ulong countTimes = 0;
static public StreamWriter logFile;
static Hashtable urlHash = new Hashtable();
static Mutex myMutex = new Mutex();
static DataTable dt = null;//使用全局dt,将已有的信息读入缓存,以减少数据库访问次数
static int threadFlag = 0;//线程结束控制标志
/// <summary>
/// Extracts the inner text of the first &lt;title&gt; tag (case-insensitive),
/// trimmed of surrounding whitespace. Returns "" when no title is present.
/// </summary>
static string GetTitle(string html)
{
    // Lookbehind/lookahead keep only the text between <title...> and </title>.
    Regex titleRegex = new Regex(@"(?<=<title.*>)([\s\S]*)(?=</title>)", RegexOptions.IgnoreCase);
    Match m = titleRegex.Match(html);
    return m.Value.Trim();
}
/// <summary>
/// Collects the host-name candidates that immediately follow "http://"
/// anywhere in the page source.
/// </summary>
static MatchCollection GetUrl(string html)
{
    Regex hostRegex = new Regex(@"(?<=http://)[\w\.]+[^/]", RegexOptions.IgnoreCase);
    return hostRegex.Matches(html);
}
/// <summary>
/// Entry point: connects to Oracle, loads the (ip,url) cache, reads two start
/// URLs from the console, scans them on two worker threads, then prints totals.
/// </summary>
static void Main(string[] args)
{
    try
    {
        if (!DBAccess.DBConnect("cry", "cry", "orcl"))
        {
            MessageBox.Show("数据库连接失败!", "错误001", MessageBoxButtons.OK, MessageBoxIcon.Error);
            // BUG FIX: Application.Exit() does not terminate a console Main —
            // the original fell through and used the dead connection. Return instead.
            return;
        }
        // ip+url form the composite primary key, so together they identify a row;
        // the whole table is cached in 'dt' to cut DB round-trips during the scan.
        dt = DBAccess.selectDB("select ip,url from net_collect ");
        Console.WriteLine("请输入开始扫描的网址(2个,逗号隔开):");
        string url = Console.ReadLine();
        string[] urlScan = url.Split(',');//"www.265.com";
        if (urlScan.Length < 2)
        {
            // Robustness: the original indexed urlScan[1] unconditionally.
            Console.WriteLine("需要2个网址,逗号隔开!");
            return;
        }
        logFile = new StreamWriter("d:\\net_collectnew.log", true);
        DateTime startTime = DateTime.Now;
        logFile.WriteLine("开始时间:" + DateTime.Now.ToString());
        Thread t = new Thread(new ParameterizedThreadStart(ipScan));
        t.Start(urlScan[0]);
        Thread t1 = new Thread(new ParameterizedThreadStart(ipScan));
        t1.Start(urlScan[1]);
        // Join both workers instead of the original busy-wait on threadFlag,
        // which burned a full CPU core while spinning.
        t.Join();
        t1.Join();
        DBAccess.DBClose();
        logFile.WriteLine("结束时间:" + DateTime.Now.ToString());
        logFile.Close();
        TimeSpan ts = DateTime.Now - startTime;
        // BUG FIX: the original declared a local 'int count = 0' that shadowed the
        // static counter incremented by the scan, so this line always printed 0.
        Console.WriteLine("共扫描网址:" + count + "个");
        Console.WriteLine("共扫描次数:" + countTimes + "次");
        Console.WriteLine("总共花费时间:" + ts.ToString());
        Console.ReadKey();
    }
    catch (Exception ex)
    {
        // logFile may still be null if the failure happened before it was opened.
        if (logFile != null) logFile.WriteLine("error:[main]" + ex.Message);
        Console.WriteLine(ex.Message);
    }
}
/// <summary>
/// Worker-thread entry point: crawls starting from the given URL, then
/// signals completion by bumping the shared thread counter.
/// </summary>
/// <param name="obj">Start URL (string) passed via Thread.Start.</param>
static void ipScan(object obj)
{
    GetHtmlInfo(obj.ToString(), 60000, Encoding.Default);
    // BUG FIX: 'threadFlag++' is not atomic and both workers race on it, which
    // could make Main's completion check miss an increment. Interlocked fixes
    // the race and returns the post-increment value for the log line.
    int flag = Interlocked.Increment(ref threadFlag);
    Console.WriteLine(flag + "号线程已停止");
    logFile.WriteLine(flag + "号线程已停止");
}
/// <summary>
/// Fetches a page's HTML (the title is captured as soon as &lt;/title&gt; appears),
/// resolves the host's IP, inserts (ip, title, url) into net_collect, then
/// recursively scans every URL found in the page body.
/// </summary>
/// <param name="url">Page address; "http://" is prepended when no scheme is present.</param>
/// <param name="timeout">Request timeout, in ms.</param>
/// <param name="EnCodeType">Caller-supplied encoding; NOTE(review): the body derives the
/// real encoding itself (GetHtmlCode / response headers), so this appears unused — confirm.</param>
/// <returns>string[2] = { ip, title }, or null when the URL is skipped or fails early.</returns>
static string[] GetHtmlInfo(string url, int timeout, Encoding EnCodeType)
{
string[] urlInfo = new string[2];
url = url.TrimEnd('"').TrimEnd('?').TrimEnd('\'').TrimEnd(':').TrimEnd('.').TrimEnd('-').Replace("'", "");// strip irregular trailing characters from the URL
// Reject obviously invalid hosts: bare "www", too short, or no dot left after removing "www."
if (url == "www"||url.Length<3||!url.Replace("www.","").Contains(".")) return null;
if (!urlHash.ContainsKey(url))
{
lock (urlHash)// lock to avoid duplicate insertion
{
// NOTE(review): the Mutex is redundant inside the lock, and the ContainsKey
// check above runs outside it (benign check-then-act race) — confirm intent.
myMutex.WaitOne();
urlHash.Add(url, url);// hash table records already-scanned URLs
myMutex.ReleaseMutex();
}
}
if (!url.StartsWith("http://") && !url.StartsWith("https://")) { url = "http://" + url; }
string result = "";
StreamReader reader = null;
string temp = "";
Encoding enc;
HttpWebRequest request = null;
HttpWebResponse response = null;
string encoding = "";
try
{
request = (HttpWebRequest)HttpWebRequest.Create(url);// initialize the WebRequest
request.Timeout = timeout;
request.UserAgent = "User-Agent:Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 2.0.40607; .NET CLR 1.1.4322; .NET CLR 3.5.30729;.NET CLR 1.0.3705)";
//request.Headers.Add("Accept-Language", "zh-cn");
//request.Headers.Add("Accept-Encoding", "gzip, deflate");
request.Accept = "*/*";
request.AllowAutoRedirect = false;
//request.ServicePoint.Expect100Continue = false;
request.KeepAlive = true;
//request.CookieContainer = new CookieContainer();
request.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
response = (HttpWebResponse)request.GetResponse();// get the response from the Internet
if (response.StatusCode == System.Net.HttpStatusCode.MovedPermanently)// follow a 301 manually (AllowAutoRedirect is off)
{
request = (HttpWebRequest)HttpWebRequest.Create(response.Headers["Location"]);// initialize the WebRequest
response = (HttpWebResponse)request.GetResponse();// get the response from the Internet
}
System.Net.IPAddress[] addresslist = Dns.GetHostEntry(url.Replace("https://","").Replace("http://","")).AddressList;
if (addresslist.Length != 0) urlInfo[0] = addresslist[0].ToString().Trim();// take the first resolved IP
else { logFile.WriteLine("error:获取IP失败! [URL]=" + url); return null; }
countTimes++;
Console.WriteLine("第[" + countTimes + "]次Scan, url=" + url.Replace("https://", "").Replace("http://", ""));
//DataTable dt=DBAccess.selectDB("select count(*) from net_collect where ip='"+urlInfo[0].ToString()+"' and url='"+url+"' ");
// Check the in-memory cache first to avoid a DB round-trip.
DataRow[] dr = dt.Select(@"ip='" + urlInfo[0].ToString() + "' and url='" + url.TrimEnd('"').TrimEnd('?').TrimEnd('\'') + "' ");
if (dr.Length>0&&count!=0)// recursion stop: record already known (cache hit)
return null;
DataTable dt1 = DBAccess.selectDB("select count(*) from net_collect where ip='" + urlInfo[0].ToString() + "' and url='" + url + "' ");
if (null == dt1) { logFile.WriteLine("error:数据库查询net_collect失败! [URL]=" + url); return null; }
if (dt1.Rows[0][0].ToString() != "0" && count != 0)// recursion stop: record already in the DB
return null;
lock (url)// NOTE(review): locking a string is unreliable (interning differs per instance); the Mutex does the real serialization — confirm
{ myMutex.WaitOne(); encoding = GetHtmlCode(url); myMutex.ReleaseMutex(); }
if (encoding == null || encoding == string.Empty || encoding.Contains("--"))
{
// Fall back to the HTTP header charset; map the ISO-8859-1 default to gb2312.
encoding = response.CharacterSet;
encoding = (encoding == "ISO-8859-1") ? "gb2312" : encoding;
}
enc = Encoding.GetEncoding(encoding);
if (response.StatusCode == System.Net.HttpStatusCode.OK)
{
StringBuilder builder = new StringBuilder();
Stream stream = response.GetResponseStream();
reader = new StreamReader(stream, enc);
string tmp = "";
while ((temp = reader.ReadLine()) != null)
{
builder.Append(temp);
tmp = builder.ToString();
// Capture the title once, as soon as </title> shows up in the accumulated text.
if (urlInfo[1]==null&&tmp.IndexOf("</title>") > 0 )
urlInfo[1] = GetTitle(tmp);
builder.Append("\r\n");
}
result = builder.ToString();
}
// Truncate very long titles so they fit the DB column.
if (urlInfo[1].Length > 3000) urlInfo[1] = urlInfo[1].Substring(0, 2800);
MatchCollection mc = GetUrl(result);
if (urlInfo[0] != null && urlInfo[1] != "无法连接到远程服务器" && url!=null)
{
// NOTE(review): values are concatenated straight into the SQL; q'[...]' quoting guards
// the title but ip/url are unescaped — parameterized commands would be safer.
DBAccess.DBExecSql(@"insert into net_collect values('" + urlInfo[0] + "',default,q'[" + urlInfo[1] + "]','" + url + "',default)");
count++;
Console.WriteLine("[" + count + "]" + "已写入数据库,信息为[url:]" + url + " [ip:]" + urlInfo[0]);
logFile.WriteLine("[" + count + "]" + "已写入数据库,信息为[url:]" + url + " [ip:]" + urlInfo[0]);
}
if(mc.Count!=0)
foreach (Match match in mc)
{
if (urlHash.ContainsKey(match.ToString())) continue;
if (match.ToString().TrimEnd('"').TrimEnd('?').TrimEnd('\'') == "www") continue;
Console.WriteLine("开始扫描网址:" + match.ToString());
logFile.WriteLine("开始扫描网址:" + match.ToString());
GetHtmlInfo(match.ToString().TrimEnd('"').TrimEnd('?').TrimEnd('\''), timeout, EnCodeType);// recurse into each discovered URL
}
return urlInfo;
}
catch (Exception ex)
{
logFile.WriteLine("error:--"+ex.Message+" [URL]="+url);
Console.WriteLine(ex.Message);
urlInfo[1] = ex.Message;
return urlInfo;
}
finally { if (reader != null) { reader.Close(); } if (response != null) { response.Close(); } if (request != null) { request.Abort(); } }
}
/// <summary>
/// Downloads the page at the given URL and returns the charset declared in its
/// markup (e.g. "utf-8", "gb2312"), "" when no declaration is found, or null
/// on error. Transparently decompresses gzip responses.
/// </summary>
/// <param name="url">Absolute page address.</param>
static string GetHtmlCode(string url)
{
    string charset = "";
    try
    {
        HttpWebRequest webRequest = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(url);
        webRequest.Timeout = 30000;
        webRequest.Method = "GET";
        webRequest.UserAgent = "User-Agent:Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 2.0.40607; .NET CLR 1.1.4322; .NET CLR 3.5.30729;.NET CLR 1.0.3705)";
        webRequest.Headers.Add("Accept-Encoding", "gzip, deflate");
        // BUG FIX: the original only called webResponse.Close() on the no-charset
        // fall-through path, leaking the response on every early return; the
        // using-blocks guarantee disposal on all paths.
        using (HttpWebResponse webResponse = (System.Net.HttpWebResponse)webRequest.GetResponse())
        using (Stream streamReceive = webResponse.GetResponseStream())
        {
            if (webResponse.ContentEncoding.ToLower() == "gzip")// decompress first when the server answered with gzip
            {
                using (var zipStream = new System.IO.Compression.GZipStream(streamReceive, System.IO.Compression.CompressionMode.Decompress))
                using (StreamReader sr = new System.IO.StreamReader(zipStream, Encoding.Default))
                {
                    charset = ScanCharset(sr) ?? "";
                }
            }
            else
            {
                using (StreamReader sr = new System.IO.StreamReader(streamReceive, Encoding.Default))
                {
                    charset = ScanCharset(sr) ?? "";
                }
            }
        }
        webRequest.Abort();
        return charset;
    }
    catch (Exception ex)
    {
        logFile.WriteLine("error:--" + ex.Message + " [charset]=" + charset);
        Console.WriteLine(ex.Message);
        return null;
    }
}
/// <summary>
/// Reads lines until a "charset" declaration is found; returns the charset name,
/// or null when the stream ends without one. Factored out because the original
/// duplicated this loop verbatim in the gzip and plain branches.
/// </summary>
static string ScanCharset(StreamReader sr)
{
    string pattern = @"(?i)\bcharset=(?<charset>[-a-zA-Z_0-9]+)";
    string line;
    while ((line = sr.ReadLine()) != null)
    {
        if (line.IndexOf("charset") > 0)
        {
            string charset = Regex.Match(line, pattern).Groups["charset"].Value;
            if (charset.Length < 3)
            {
                // Handle <meta charset="utf-8" /> style declarations the regex
                // above misses; forced two-way choice between utf-8 and gb2312.
                charset = line.Substring(line.IndexOf("=") + 1).Replace("\"", "").Replace("/>", "").Trim();
                charset = charset.ToLower().Contains("utf-8") ? "utf-8" : "gb2312";
            }
            return charset;
        }
    }
    return null;
}
}
}
运行如图:
---日志文件部分内容如下:
开始时间:2013-5-14 19:22:37
[1]已写入数据库,信息为[url:]http://www.265.com [ip:]203.208.37.17
开始扫描网址:www.w3.org
开始扫描网址:www.google.com.hk
开始扫描网址:news.google.com.hk
开始扫描网址:ditu.google.cn
[2]已写入数据库,信息为[url:]http://ditu.google.cn [ip:]203.208.36.18
开始扫描网址:schema.org
[3]已写入数据库,信息为[url:]http://schema.org [ip:]74.125.31.100
开始扫描网址:www.google.com
开始扫描网址:blog.schema.org"
开始扫描网址:www.google.cn
[4]已写入数据库,信息为[url:]http://www.google.cn [ip:]203.208.36.19
开始扫描网址:translate.google.cn
[5]已写入数据库,信息为[url:]http://translate.google.cn [ip:]203.208.37.17
开始扫描网址:picasaweb.google.com
开始扫描网址:video.google.cn
[6]已写入数据库,信息为[url:]http://video.google.cn [ip:]203.208.46.177
开始扫描网址:www.miibeian.gov.cn
开始扫描网址:www.gstatic.com
开始扫描网址:translate.google.com
...
-----------------------------------------
不上传工具了,如对此感兴趣者,可直接联系我索要源码及工具.QQ:278676125