网页标题信息采集

--------------------------------------------------------------------------
-----------------------------Cryking原创------------------------------
-----------------------转载请注明出处,谢谢!------------------------

之前做了一个通过IP来扫描网站的工具,以为能扫到所有的网站,因为IP是可以全部遍历的,但是忘记了很多网站是禁止反向解析的。

所以更改了写法,仿照爬虫的思路:抓取一个网页的内容,遍历其中出现的网址,再根据这些网址继续抓取内容,如此递归下去,做了个简单的网页标题信息采集工具.

--更新,1.使用多线程处理;2.使用缓存技术减少DB访问;3.优化网页编码获取;4.增加错误日志记录及错误网址检测

主要代码如下:

using System;
using System.Threading;
using System.Collections.Generic;
using System.Windows.Forms;
using System.Linq;
using System.Net;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using System.Data;
using System.Data.OracleClient;
using System.Collections;

namespace netCollect
{
    /// <summary>
    /// Page-title harvester: fetches a start page, extracts its title and every
    /// host name appearing in the body, stores (ip, title, url) rows in the
    /// Oracle table net_collect, and recursively scans each discovered host.
    /// Two scanner threads run concurrently, so all shared state below is
    /// guarded explicitly.
    /// </summary>
    class Program
    {
        static long count = 0;                          // URLs successfully written to the DB (Interlocked)
        static long countTimes = 0;                     // total scan attempts (Interlocked)
        static public StreamWriter logFile;             // shared log file; all writes go through Log()
        static Hashtable urlHash = new Hashtable();     // set of already-seen URLs, guarded by hashGate
        static readonly object hashGate = new object(); // guards urlHash (replaces the redundant lock+Mutex pair)
        static readonly object logGate = new object();  // guards logFile (StreamWriter is not thread-safe)
        static DataTable dt = null;                     // cache of existing (ip,url) rows to cut DB round-trips
        static int threadFlag = 0;                      // number of finished scanner threads (Interlocked)

        // Serializes writes to the shared log file; two scanner threads writing
        // the same StreamWriter unsynchronized can corrupt it.
        static void Log(string msg)
        {
            lock (logGate) { logFile.WriteLine(msg); }
        }

        /// <summary>
        /// Extracts the text between &lt;title...&gt; and &lt;/title&gt;.
        /// Returns "" when the page has no title tag.
        /// </summary>
        static string GetTitle(string html)
        {
            // [^>]* instead of the original greedy .* so the lookbehind only spans
            // tag attributes and cannot swallow a '>' inside the title text;
            // lazy *? binds to the FIRST </title> instead of the last one.
            string regex = @"(?<=<title[^>]*>)([\s\S]*?)(?=</title>)";
            Regex reg = new Regex(regex, RegexOptions.IgnoreCase);
            return reg.Match(html).Value.Trim();
        }

        /// <summary>
        /// Matches every host name following http:// or https:// in the page body.
        /// </summary>
        static MatchCollection GetUrl(string html)
        {
            // The original trailing [^/] dragged one arbitrary non-'/' character
            // (often a quote) into the match; a tight character class avoids that,
            // and https plus '-' in host names are now supported.
            string regex = @"(?<=https?://)[\w\.\-]+";
            Regex reg = new Regex(regex, RegexOptions.IgnoreCase);
            return reg.Matches(html);
        }

        static void Main(string[] args)
        {
            try
            {
                if (!DBAccess.DBConnect("cry", "cry", "orcl"))
                {
                    MessageBox.Show("数据库连接失败!", "错误001", MessageBoxButtons.OK, MessageBoxIcon.Error);
                    // Application.Exit() does not stop a console Main; return so we
                    // never touch the DB after a failed connect.
                    return;
                }
                // (ip,url) is the composite primary key, so this snapshot uniquely
                // identifies every existing record.
                dt = DBAccess.selectDB("select ip,url from net_collect ");
                Console.WriteLine("请输入开始扫描的网址(2个,逗号隔开):");
                string url = Console.ReadLine();
                string[] urlScan = (url ?? "").Split(',');
                if (urlScan.Length < 2)
                {
                    // The original indexed urlScan[1] unconditionally and crashed
                    // with IndexOutOfRangeException on a single URL.
                    Console.WriteLine("请输入2个网址,逗号隔开!");
                    return;
                }
                logFile = new StreamWriter("d:\\net_collectnew.log", true);
                DateTime startTime = DateTime.Now;
                logFile.WriteLine("开始时间:" + DateTime.Now.ToString());
                Thread t = new Thread(new ParameterizedThreadStart(ipScan));
                t.Start(urlScan[0]);
                Thread t1 = new Thread(new ParameterizedThreadStart(ipScan));
                t1.Start(urlScan[1]);
                // Join replaces the original while(true) spin on threadFlag,
                // which burned a full CPU core while waiting.
                t.Join();
                t1.Join();
                DBAccess.DBClose();
                logFile.WriteLine("结束时间:" + DateTime.Now.ToString());
                logFile.Close();
                TimeSpan ts = DateTime.Now - startTime;
                // Prints the static counter; the original declared a local
                // 'int count' here that shadowed it and always printed 0.
                Console.WriteLine("共扫描网址:" + count + "个");
                Console.WriteLine("共扫描次数:" + countTimes + "次");
                Console.WriteLine("总共花费时间:" + ts.ToString());
                Console.ReadKey();
            }
            catch (Exception ex)
            {
                // logFile is created after several failure points and may be null.
                if (logFile != null) logFile.WriteLine("error:[main]" + ex.Message);
                Console.WriteLine(ex.Message);
            }
        }

        // Thread entry point: scans one start URL, then records that this thread finished.
        static void ipScan(object obj)
        {
            GetHtmlInfo(obj.ToString(), 60000, Encoding.Default);
            // ++ on a shared int is not atomic; with the plain increment both
            // threads could observe (and report) the same value.
            int stopped = Interlocked.Increment(ref threadFlag);
            Console.WriteLine(stopped + "号线程已停止");
            Log(stopped + "号线程已停止");
        }

        /// <summary>
        /// Fetches a page, resolves its IP, extracts the title, inserts the record,
        /// then recursively scans every host name found in the page body.
        /// </summary>
        /// <param name="url">Page address; "http://" is prepended when no scheme is present.</param>
        /// <param name="timeout">Request timeout in milliseconds.</param>
        /// <param name="EnCodeType">Unused fallback encoding; kept for interface compatibility.</param>
        /// <returns>[0]=ip, [1]=title; null when the URL is skipped, unresolvable or already recorded.</returns>
        static string[] GetHtmlInfo(string url, int timeout, Encoding EnCodeType)
        {
            string[] urlInfo = new string[2];
            // Strip trailing junk characters that ride along with scraped host names.
            url = url.TrimEnd('"').TrimEnd('?').TrimEnd('\'').TrimEnd(':').TrimEnd('.').TrimEnd('-').Replace("'", "");
            if (url == "www" || url.Length < 3 || !url.Replace("www.", "").Contains(".")) return null;
            // Check-and-add under ONE lock: the original tested ContainsKey outside
            // the lock (a check-then-act race) and doubled up with a Mutex.
            lock (hashGate)
            {
                if (!urlHash.ContainsKey(url)) urlHash.Add(url, url);
            }
            if (!url.StartsWith("http://") && !url.StartsWith("https://")) { url = "http://" + url; }
            string result = "";
            StreamReader reader = null;
            string line;
            Encoding enc;
            HttpWebRequest request = null;
            HttpWebResponse response = null;
            string encoding = "";
            try
            {
                request = (HttpWebRequest)HttpWebRequest.Create(url);
                request.Timeout = timeout;
                request.UserAgent = "User-Agent:Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 2.0.40607; .NET CLR 1.1.4322; .NET CLR 3.5.30729;.NET CLR 1.0.3705)";
                request.Accept = "*/*";
                request.AllowAutoRedirect = false; // 301s are followed by hand just below
                request.KeepAlive = true;
                request.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
                response = (HttpWebResponse)request.GetResponse();
                if (response.StatusCode == System.Net.HttpStatusCode.MovedPermanently)
                {
                    // Follow the permanent redirect; close the first response before
                    // reassigning — the original leaked the connection here.
                    string location = response.Headers["Location"];
                    response.Close();
                    request = (HttpWebRequest)HttpWebRequest.Create(location);
                    response = (HttpWebResponse)request.GetResponse();
                }
                System.Net.IPAddress[] addresslist = Dns.GetHostEntry(url.Replace("https://", "").Replace("http://", "")).AddressList;
                if (addresslist.Length != 0) urlInfo[0] = addresslist[0].ToString().Trim();
                else { Log("error:获取IP失败! [URL]=" + url); return null; }
                Interlocked.Increment(ref countTimes); // shared between both scanner threads
                Console.WriteLine("第[" + countTimes + "]次Scan, url=" + url.Replace("https://", "").Replace("http://", ""));
                // Recursion terminates once the (ip,url) pair is already cached or stored.
                // NOTE(review): these statements are still built by string concatenation;
                // switch to bind variables if DBAccess ever supports parameterized SQL.
                DataRow[] dr = dt.Select(@"ip='" + urlInfo[0].ToString() + "' and url='" + url.TrimEnd('"').TrimEnd('?').TrimEnd('\'') + "' ");
                if (dr.Length > 0 && count != 0)
                    return null;
                DataTable dt1 = DBAccess.selectDB("select count(*) from net_collect where ip='" + urlInfo[0].ToString() + "' and url='" + url + "' ");
                if (null == dt1) { Log("error:数据库查询net_collect失败! [URL]=" + url); return null; }
                if (dt1.Rows[0][0].ToString() != "0" && count != 0)
                    return null;
                // GetHtmlCode touches no shared state; the original 'lock (url)' on a
                // string instance was both an anti-pattern and a no-op.
                encoding = GetHtmlCode(url);
                if (encoding == null || encoding == string.Empty || encoding.Contains("--"))
                {
                    encoding = response.CharacterSet;
                    // Servers frequently claim ISO-8859-1 while actually serving gb2312.
                    encoding = (encoding == "ISO-8859-1") ? "gb2312" : encoding;
                }
                enc = Encoding.GetEncoding(encoding);
                if (response.StatusCode == System.Net.HttpStatusCode.OK)
                {
                    StringBuilder builder = new StringBuilder();
                    Stream stream = response.GetResponseStream();
                    reader = new StreamReader(stream, enc);
                    while ((line = reader.ReadLine()) != null)
                    {
                        builder.Append(line);
                        // Capture the title as soon as the closing tag appears.
                        if (urlInfo[1] == null && builder.ToString().IndexOf("</title>") > 0)
                            urlInfo[1] = GetTitle(builder.ToString());
                        builder.Append("\r\n");
                    }
                    result = builder.ToString();
                }
                // urlInfo[1] stays null when the page has no title; the original
                // dereferenced .Length unconditionally and threw NRE here.
                if (urlInfo[1] != null && urlInfo[1].Length > 3000) urlInfo[1] = urlInfo[1].Substring(0, 2800);
                MatchCollection mc = GetUrl(result);
                if (urlInfo[0] != null && urlInfo[1] != "无法连接到远程服务器" && url != null)
                {
                    // The title rides inside Oracle q'[...]' quoting; a title containing
                    // "]'" would still break the statement — parameterize when possible.
                    DBAccess.DBExecSql(@"insert into net_collect values('" + urlInfo[0] + "',default,q'[" + urlInfo[1] + "]','" + url + "',default)");
                    long written = Interlocked.Increment(ref count);
                    Console.WriteLine("[" + written + "]" + "已写入数据库,信息为[url:]" + url + "  [ip:]" + urlInfo[0]);
                    Log("[" + written + "]" + "已写入数据库,信息为[url:]" + url + "  [ip:]" + urlInfo[0]);
                }
                foreach (Match match in mc)
                {
                    if (urlHash.ContainsKey(match.ToString())) continue; // already visited
                    string next = match.ToString().TrimEnd('"').TrimEnd('?').TrimEnd('\'');
                    if (next == "www") continue;
                    Console.WriteLine("开始扫描网址:" + match.ToString());
                    Log("开始扫描网址:" + match.ToString());
                    GetHtmlInfo(next, timeout, EnCodeType); // depth-first recursion
                }
                return urlInfo;
            }
            catch (Exception ex)
            {
                Log("error:--" + ex.Message + " [URL]=" + url);
                Console.WriteLine(ex.Message);
                urlInfo[1] = ex.Message;
                return urlInfo;
            }
            finally
            {
                // Release the network objects on every path, success or failure.
                if (reader != null) { reader.Close(); }
                if (response != null) { response.Close(); }
                if (request != null) { request.Abort(); }
            }
        }

        /// <summary>
        /// Downloads the page (transparently inflating gzip bodies) and sniffs the
        /// charset declared in its meta tags. Returns "" when none is found,
        /// null on error.
        /// </summary>
        static string GetHtmlCode(string url)
        {
            string charset = "";
            try
            {
                string line;
                string pattern = @"(?i)\bcharset=(?<charset>[-a-zA-Z_0-9]+)";
                HttpWebRequest webRequest = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(url);
                webRequest.Timeout = 30000;
                webRequest.Method = "GET";
                webRequest.UserAgent = "User-Agent:Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 2.0.40607; .NET CLR 1.1.4322; .NET CLR 3.5.30729;.NET CLR 1.0.3705)";
                webRequest.Headers.Add("Accept-Encoding", "gzip, deflate");
                // 'using' guarantees the response is closed even when we return from
                // inside the read loop; the original leaked it on every early return.
                // The gzip and plain branches were duplicated line-for-line, so they
                // are merged: only the stream wrapping differs.
                using (HttpWebResponse webResponse = (System.Net.HttpWebResponse)webRequest.GetResponse())
                using (Stream raw = webResponse.GetResponseStream())
                {
                    Stream body = raw;
                    if (webResponse.ContentEncoding.ToLower() == "gzip")
                        body = new System.IO.Compression.GZipStream(raw, System.IO.Compression.CompressionMode.Decompress);
                    using (StreamReader sr = new StreamReader(body, Encoding.Default))
                    {
                        while ((line = sr.ReadLine()) != null)
                        {
                            if (line.IndexOf("charset") <= 0) continue;
                            charset = Regex.Match(line, pattern).Groups["charset"].Value;
                            if (charset.Length < 3)
                            {
                                // Handle the bare <meta charset="utf-8" /> form.
                                charset = line.Substring(line.IndexOf("=") + 1).Replace("\"", "").Replace("/>", "").Trim();
                                // Crude two-way choice, same policy as the original tool.
                                charset = charset.ToLower().Contains("utf-8") ? "utf-8" : "gb2312";
                            }
                            return charset;
                        }
                    }
                }
                webRequest.Abort();
                return charset;
            }
            catch (Exception ex)
            {
                Log("error:--" + ex.Message + " [charset]=" + charset);
                Console.WriteLine(ex.Message);
                return null;
            }
        }
    }
}


运行如图:


---日志文件部分内容如下:

开始时间:2013-5-14 19:22:37
[1]已写入数据库,信息为[url:]http://www.265.com [ip:]203.208.37.17
开始扫描网址:www.w3.org
开始扫描网址:www.google.com.hk
开始扫描网址:news.google.com.hk
开始扫描网址:ditu.google.cn
[2]已写入数据库,信息为[url:]http://ditu.google.cn [ip:]203.208.36.18
开始扫描网址:schema.org
[3]已写入数据库,信息为[url:]http://schema.org [ip:]74.125.31.100
开始扫描网址:www.google.com
开始扫描网址:blog.schema.org"
开始扫描网址:www.google.cn
[4]已写入数据库,信息为[url:]http://www.google.cn [ip:]203.208.36.19
开始扫描网址:translate.google.cn
[5]已写入数据库,信息为[url:]http://translate.google.cn [ip:]203.208.37.17
开始扫描网址:picasaweb.google.com
开始扫描网址:video.google.cn
[6]已写入数据库,信息为[url:]http://video.google.cn [ip:]203.208.46.177
开始扫描网址:www.miibeian.gov.cn
开始扫描网址:www.gstatic.com
开始扫描网址:translate.google.com

...

-----------------------------------------

不上传工具了,如对此感兴趣者,可直接联系我索要源码及工具.QQ:278676125

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
简单C#信息采集工具实现 http://blog.csdn.net/xiaoxiao108/archive/2011/06/01/6458367.aspx 最近想整只爬虫玩玩,顺便熟悉下正则表达式。 开发环境 vs2008 sql2000 实现方法如下 1.先抓取网页代码 2.通过正则匹配出你需要的内容 比如http://www.soso.com/q?w=%C4%E3%BA%C3&pg=1 页面中 搜索结果的标题跟连接地址。具体可以根据你的需要填写合适的地址跟正则。 3.把匹配出的内容保存到数据库中。对其中的数据可以根据需要自己进行处理 具体实现代码 1.读取网页的代码 public static string GetDataFromUrl(string url) { string str = string.Empty; HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url); //设置Http头; request.AllowAutoRedirect = true; request.AllowWriteStreamBuffering = true; request.Referer = ""; request.Timeout = 10 * 1000; //request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)"; HttpWebResponse response = null; try { response = (HttpWebResponse)request.GetResponse(); if (response.StatusCode == HttpStatusCode.OK) { //根据http应答头来判别编码 string Characterset = response.CharacterSet; Encoding encode; if (Characterset != "") { if (Characterset == "ISO-8859-1") { Characterset = "gb2312"; } encode = Encoding.GetEncoding(Characterset); } else { encode = Encoding.Default; } //声明一个内存流来贮存http应答流 Stream Receivestream = response.GetResponseStream(); MemoryStream mstream = new MemoryStream(); byte[] bf = new byte[255]; int count = Receivestream.Read(bf, 0, 255); while (count > 0) { mstream.Write(bf, 0, count); count = Receivestream.Read(bf, 0, 255); } Receivestream.Close(); mstream.Seek(0, SeekOrigin.Begin); //从内存流里读取字符串这里涉及到了编码方案 StreamReader reader = new StreamReader(mstream, encode); char[] buf = new char[1024]; count = reader.Read(buf, 0, 1024); while (count > 0) { str += new string(buf, 0, 1024); count = reader.Read(buf, 0, 1024); } reader.Close(); mstream.Close(); } } catch (Exception ex) { GetDataFromUrl(url); } finally { if (response != null) response.Close(); } return str; } 2.正则匹配的代码 public static ArrayList GetString(string reg, string content) { Regex r = new Regex(reg, RegexOptions.Compiled); MatchCollection matches = 
r.Matches(content); ArrayList a = new ArrayList(); foreach (Match m in matches) { string[] arr = new string[10]; arr[0] = m.Groups[1].Value; arr[1] = m.Groups[2].Value; arr[2] = m.Groups[3].Value; arr[3] = m.Groups[4].Value; arr[4] = m.Groups[5].Value; arr[5] = m.Groups[6].Value; arr[6] = m.Groups[7].Value; arr[7] = m.Groups[8].Value; arr[8] = m.Groups[9].Value; arr[9] = m.Groups[10].Value; a.Add(arr); } return a; } 3.如果抓取的页面很多 ,可以把多线程跟队列应用过来,提高抓取效率 Queue numbers = new Queue(); const int MaxCount = 5;//同时运行的最多线程数 private static object _lock = new object(); private void Test() { while (true) { int i = 0; lock (_lock) { if (numbers.Count == 0) { flag = false; return; } i = numbers.Dequeue(); } f(i); } } void Ssss() { for (int i = 1; i <= 100; i++)//处理的页面参数 从http://www.soso.com/q?w=你好&pg=1 到http://www.soso.com/q?w=你好&pg=100 { numbers.Enqueue(i); } for (int i = 0; i < MaxCount; i++) { Thread thread = new Thread(new ThreadStart(Test)); thread.Name = "T" + i.ToString(); thread.Start(); } } private void f(int num) { string str = ClassLibrary1.Class1.GetDataFromUrl("http://www.soso.com/q?w=%C4%E3%BA%C3&pg="+num); string reg = "]+? target=\"_blank\">([\\s\\S]+?)"; ArrayList a = ClassLibrary1.Class1.GetString(reg, str); for (int i = 0; i ] 除了>以为的字符 [\u4e00-\u9fa5] 汉字 6.代码只是实现了信息采集的主要功能,根据你自己的需要更换采集页面,跟合适的正则表达式后,可以根据你的需要自动进行采集,对采集到的数据,再根据你的需要自己进行处理。 7.数据库操作部分用的3层代码生成器连接地址 在 app.config中 如果你发现有什么不合理的,需要改进的地方,联系328452421@qq.com 朱晓 。相互交流 谢谢 顺便问下 有家是新泰的没,搞软件开发 地

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值