C#版采集程序源码

因为工作需要,自己写了一个采集程序。如果冒犯了你的网站,我在这里说一声对不起!
哎~!我只是一个普通的程序员。
namespace CJ
{
    public partial class Form1 : Form
    {
        // Index into `al` of the proxy used for the next request; incremented whenever a request fails.
        public int proxy = 0;
        // Running counters, pre-incremented to produce the numeric key of each generated row:
        // keyi -> insertdl rows, keyj -> insertxl rows, keym -> insertinfo rows, keyn -> insertwz rows.
        public int keyi = 0;
        public int keyj = 0;
        public int keym = 0;
        public int keyn = 0;
        // Incremented once per page handled by the crawl methods; shown in textBox1 when the crawl ends.
        public int sum = 0;

        // Scratch values overwritten at every crawl level: the link and title captured from the
        // current match, and the absolute URL / local directory derived from them.
        // Note that `two` reads `dirname` as it was last set by `one`.
        public string newurl = "";
        public string cururl = "";
        public string dirname = "";
        public string curdir = "";

        // Text of the page currently being processed.
        public string responseFromServer = "";
        // Local path of the saved copy of the page currently being processed.
        public string filename = "";
        // The SQL statement most recently built.
        public string sql = "";
        // Last catalogue heading seen by `three`; reused for entries that carry no heading of their own.
        public string mulu = "";

        // SQL statements collected by SaveSqls and executed together by Executesql.
        StringBuilder sbs = new StringBuilder();
        // Article objects queued by `four` and written to disk by SaveXmls.
        List<Class1> cls = new List<Class1>();
        // Proxy addresses (one string per entry) filled by ReadIPproxy.
        public ArrayList al = new ArrayList();

        // Opening text of the insert statement for each target table; values and ")" are appended.
        public string insertdl = "insert into mzinedl values(";
        public string insertxl = "insert into mzinexl values(";
        public string insertinfo = "insert into mzineinfo values(";
        public string insertwz = "insert into mzinewz values(";

        /// <summary>
        /// Creates the form; all setup is delegated to <c>InitializeComponent</c>, which is
        /// defined outside this part of the class.
        /// </summary>
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        ///  保存网页
        /// </summary>
        /// <param name="FILE_NAME">文件的路径</param>
        /// <param name="data">数据</param>
        public void TextToFile(string FILE_NAME, string data)
        {
            if (File.Exists(FILE_NAME))
            {
                return;
            }
            using (StreamWriter sw = File.CreateText(FILE_NAME))
            {
                sw.Write(data);
                sw.Close();
            }
        }
        /// <summary>
        ///  下载文件
        /// </summary>
        /// <param name="PageUrl">网址</param>
        /// <param name="filename">保存文件路径</param>
        public void DownFile(string PageUrl, string filename)
        {
            if (!Directory.Exists(filename))
            {
                Directory.CreateDirectory(filename);
            }
            string  path = PageUrl.Substring(PageUrl.LastIndexOf("/") + 1);
            string dirname = filename + "//" + path;


            if (File.Exists(dirname))
            {
                return;
            }
            else
            {
                try
                {                   
                    WebClient wc = new WebClient();
                    WebProxy wp = new WebProxy(al[proxy].ToString(), true);
                    wc.Proxy = wp;
                    wc.DownloadFile(PageUrl, dirname);
                }
                catch (WebException ex)
                {
                    if (ex.Status == WebExceptionStatus.ConnectFailure)
                    {
                        //无法连接到远程服务器, --换代理 IP
                        //MessageBox.Show(ex.ToString());
                        proxy++;
                        if (proxy >= al.Count)
                        {
                            al = ReadIPproxy("e://test.txt");//初始化代理   IP
                        }
                        DownFile(PageUrl, filename);
                    }
                    else if (ex.Status == WebExceptionStatus.Timeout)
                    {
                        //超时 --换代理 IP
                        //MessageBox.Show(ex.ToString());
                        proxy++;
                        if (proxy >= al.Count)
                        {
                            al = ReadIPproxy("e://test.txt");//初始化代理   IP
                        }
                        DownFile(PageUrl, filename);
                    }
                    else if (ex.Status == WebExceptionStatus.ProtocolError)
                    {
                        //文件未找到--跳出
                        //MessageBox.Show(ex.ToString());
                        return;
                    }
                }
            }

        }
        /// <summary>
        /// 读文件
        /// </summary>
        /// <param name="FILE_NAME">文件的路径</param>
        /// <returns>数据</returns>
        public ArrayList ReadIPproxy(string FILE_NAME)
        {   
            using (StreamReader sr = File.OpenText(FILE_NAME))
            {
                String input;
                while ((input = sr.ReadLine()) != null)
                {
                    al.Add(input);
                }         
                sr.Close();
            }
            return al;

        }
        /// <summary>
        /// 数据库
        /// </summary>
        public void Executesql()
        {
            SqlHelper.ExecuteNonQuery(SqlHelper.sqlstr, CommandType.Text, sbs.ToString(), null);
        }
        /// <summary>
        /// 读文件
        /// </summary>
        /// <param name="FILE_NAME">文件的路径</param>
        /// <returns>数据</returns>
        public string FileToText(string FILE_NAME)
        {
            string data;
            using (StreamReader sr = File.OpenText(FILE_NAME))
            {
                data=sr.ReadToEnd();
                sr.Close();
            }
            return data;
        }
        /// <summary>
        /// 保存SQL
        /// </summary>
        /// <param name="sql"></param>
        public void SaveSqls(string sql)
        {
            sbs.Append(sql).Append("/n");
        }   
        /// <summary>
        ///  请求失败的时候,反复操作
        /// </summary>
        /// <param name="PageUrl"></param>
        /// <returns></returns>
        public string ToServer(string PageUrl)
        {
            string responseFromServer = "";
         
            try
            {               
                while (1 == 1)
                {
                    WebRequest request = WebRequest.Create(PageUrl);
                    WebProxy wp = new WebProxy(al[proxy].ToString(), true);
                    request.Proxy = wp;
                    request.Timeout = 1000 * 60;

                    HttpWebResponse response = (HttpWebResponse)request.GetResponse();
                    Stream dataStream = response.GetResponseStream();
                    StreamReader reader=null;
                    try
                    {
                        reader = new StreamReader(dataStream, System.Text.Encoding.Default);
                        responseFromServer = reader.ReadToEnd();
                    }
                    catch
                    {
                        proxy++;
                        if (proxy >= al.Count)
                        {
                            al = ReadIPproxy("e://test.txt");//初始化代理   IP
                        }
                        ToServer(PageUrl);
                    };
                    reader.Close();
                    dataStream.Close();
                    response.Close();
                    if (responseFromServer.Contains("refresh") || responseFromServer == "")
                    {
                        proxy++;
                        if (proxy >= al.Count)
                        {
                            al = ReadIPproxy("e://test.txt");//初始化代理   IP
                        }
                        //ToServer(PageUrl);
                    }
                    else
                    {
                        break;
                    }
                }
            }
            catch (WebException ex)
            {               
                if (ex.Status == WebExceptionStatus.ProtocolError)
                {                  
                    responseFromServer = "";
                }
                else
                {
                    proxy++;
                    if (proxy >= al.Count)
                    {
                        al = ReadIPproxy("e://test.txt");//初始化代理   IP
                    }
                    ToServer(PageUrl);
                }
            }          
            return responseFromServer;
        }
        /// <summary>
        /// 保存XML 文件
        /// </summary>
        public void SaveXmls()
        {
            string pathxml = "";
            foreach (Class1 c in cls)
            {
                Class1 s = c;
                pathxml = s.address;

                if (!File.Exists(pathxml))
                {
                    XmlSerializer xs = new XmlSerializer(typeof(Class1));
                    Stream stream = new FileStream(pathxml, FileMode.Create, FileAccess.Write, FileShare.ReadWrite);
                    xs.Serialize(stream, s);
                    stream.Close();
                }            
              
            }
        }
        /// <summary>
        ///  移除HTMl 标记
        /// </summary>
        /// <param name="Html"></param>
        /// <param name="RegStr"></param>
        /// <returns></returns>
        public static string Remove(string Html)
        {
            //Regex Reg = new Regex(RegStr);
            //foreach (Match m in Reg.Matches(Html))
            //{
            //    Html = Html.Replace(m.Value, "");
            //}
            //return Html.Trim();
            string regesstr = "<.*?>";
            return Regex.Replace(Html, regesstr, string.Empty, RegexOptions.IgnoreCase);
        }
        public static string FilterScript(string content)
        {
            string regexstr = @"<(script)[^>]*>(/s*|.)*<//1>";
            return Regex.Replace(content,regexstr,string.Empty,RegexOptions.IgnoreCase);

        }
        /// <summary>
        /// 过略所有的 危险标记
        /// </summary>
        /// <param name="html"></param>
        /// <returns></returns>
        public string wipeScript(string html)
        {
            System.Text.RegularExpressions.Regex regex1 = new System.Text.RegularExpressions.Regex(@"(<script){1,}[^<>]*>[^/0]*(<//script>){1,}", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
            System.Text.RegularExpressions.Regex regex2 = new System.Text.RegularExpressions.Regex(@"href   *=   *[/s/S]*script   *:", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
            System.Text.RegularExpressions.Regex regex3 = new System.Text.RegularExpressions.Regex(@"on[/s/S]*=", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
            System.Text.RegularExpressions.Regex regex4 = new System.Text.RegularExpressions.Regex(@"<iframe[/s/S]+</iframe*>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
            System.Text.RegularExpressions.Regex regex5 = new System.Text.RegularExpressions.Regex(@"<frameset[/s/S]+</frameset*>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
            html = regex1.Replace(html, "");   //过滤<script></script>标记  
            html = regex2.Replace(html, "");   //过滤href=javascript:   (<A>)   属性  
            html = regex3.Replace(html, "   _disibledevent=");   //过滤其它控件的on...事件  
            html = regex4.Replace(html, "");   //过滤iframe  
            html = regex5.Replace(html, "");   //过滤frameset  
            return html;
        }
        public void HtmlSource(string urlpri)
        {
            //要写入的文件路径
            filename = "E://观2//magazine.html";

            if (!Directory.Exists("E://观2"))
            {
                Directory.CreateDirectory("E://观2");
            }
            if (File.Exists(filename))
            {
                responseFromServer=FileToText(filename); //存在
               
            }
            else
            {
                responseFromServer = ToServer(urlpri); //不存在
               
            }
            sum++;
            if (responseFromServer != "")
            {
                //分析内容
                TextToFile(filename,responseFromServer);

                MatchCollection mc = Regex.Matches(responseFromServer, @"href=""/magazine/(.*)""><b>(.*)</b>", RegexOptions.IgnoreCase);
                foreach (Match m in mc)
                {
                    newurl = m.Groups[1].Value;
                    dirname = m.Groups[2].Value;

                    int key = ++keyi;
                    sql = insertdl + key + ",'" + dirname + "')";
                    SaveSqls(sql);

                    cururl = urlpri + newurl;
                    curdir = "E://观2//" + dirname;

                    one(cururl, curdir,key);
                }
                SaveXmls();
                Executesql();               
               
                this.textBox1.Text = sum.ToString();
                MessageBox.Show("采集成功!");
            }
        }
        public void one(string urlpri,string _dirname,int _key)
        {
            //要写入的文件路径
            filename = _dirname +"//"+ urlpri.Substring(urlpri.LastIndexOf("/") + 1);

            if (!Directory.Exists(_dirname))
            {
                Directory.CreateDirectory(_dirname);
            }
            if (File.Exists(filename))
            {
                responseFromServer = FileToText(filename);
            }
            else
            {
                responseFromServer = ToServer(urlpri);
            }
            sum++;
            if (responseFromServer != "")
            {
                TextToFile(filename, responseFromServer);

                MatchCollection mc = Regex.Matches(responseFromServer, @"href=""/././(.*list.html)""[/s/S]*?《(.*?)》", RegexOptions.IgnoreCase);

                foreach (Match m in mc)
                {
                    newurl = m.Groups[1].Value;
                    dirname = m.Groups[2].Value;

                    cururl = "http://www.zydg.net/magazine/" + newurl;
                    curdir = _dirname + "//" + dirname;

                    two(cururl, curdir, _key);

                }                              
            }          
        }
        public void two(string urlpri,string _dirname,int _key)
        {
            filename = urlpri.Substring(0, urlpri.LastIndexOf("/"));
            filename = filename.Substring(filename.LastIndexOf("/") + 1) + ".html";
            filename = _dirname + "//" + filename;

            if (!Directory.Exists(_dirname))
            {
                Directory.CreateDirectory(_dirname);
            }

            if (File.Exists(filename))
            {
                responseFromServer = FileToText(filename);
            }
            else
            {
                responseFromServer = ToServer(urlpri);
            }
            sum++;
            if (responseFromServer != "")
            {
                TextToFile(filename, responseFromServer);

                Match mc = Regex.Match(responseFromServer, @"刊/s+期:(.*?)<br>[/s/S]*?编/s+辑:(.*?)<br>[/s/S]*?出/s+版: (.*?)<br>[/s/S]*?联系电话:(.*?)<br>[/s/S]*?E-mail: (.*?)<br>[/s/S]*?社/s+址:(.*?)<br>[/s/S]*?邮/s+编: (.*?)<br>[/s/S]*?邮发代号:(.*?)<br>[/s/S]*?国外发行代号: (.*?)<br>[/s/S]*?国际标准刊号:(.*?)<br>[/s/S]*?国内统一刊号: (.*?)</td>", RegexOptions.IgnoreCase);
                Match content = Regex.Match(responseFromServer, @"刊/s+物/s+简/s+介/s+:::...([/s/S]*?)...:::/s+收录期号列表", RegexOptions.Multiline);
                int key = ++keyj;
                sql = insertxl + keyj + "," + _key + ",'" + dirname + "','" + mc.Groups[1].Value + "','" + mc.Groups[2].Value + "','" +
                       mc.Groups[3].Value + "','" + mc.Groups[4].Value + "','" + mc.Groups[5].Value + "','" + mc.Groups[6].Value + "','" +
                       mc.Groups[7].Value + "','" + mc.Groups[8].Value + "','" + mc.Groups[9].Value + "','" + mc.Groups[10].Value + "','" + mc.Groups[11].Value + "','" + Remove(content.Groups[1].Value) + "')";
                SaveSqls(sql);

                MatchCollection mc2 = Regex.Matches(responseFromServer, @"href='(.*?)'/s+target.*>(.*?)</a>", RegexOptions.IgnoreCase);
                foreach (Match m2 in mc2)
                {
                    newurl = m2.Groups[1].Value;
                    dirname = m2.Groups[2].Value.Replace("年", "-").Replace("第", "").Replace("期", "");

                    cururl = urlpri.Substring(0, urlpri.LastIndexOf("/") + 1) + newurl;
                    curdir = _dirname + "//" + dirname;

                    three(cururl, curdir,key,dirname);
                }
               
            }          
        }
        public void three(string urlpri,string _dirname,int _key,string qishu)
        {
            //要写入的文件路径
            filename = _dirname + "//" + urlpri.Substring(urlpri.LastIndexOf("/") + 1);

            if (!Directory.Exists(_dirname))
            {
                Directory.CreateDirectory(_dirname);
            }

            if (File.Exists(filename))
            {
                responseFromServer = FileToText(filename);
            }
            else
            {
                responseFromServer = ToServer(urlpri);
            }
            sum++;
            if (responseFromServer != "")
            {
                TextToFile(filename, responseFromServer);

                Match m = Regex.Match(responseFromServer, @"src='face_(.*?)'", RegexOptions.IgnoreCase);
                string photoName = "";
                if (m.Groups[1].Value.Trim() != "")
                {
                    photoName = urlpri.Substring(0, urlpri.LastIndexOf("/") + 1) +"face_" + m.Groups[1].Value;
                    DownFile(photoName, _dirname);
                  
                }
                int key = ++keym;
                sql = insertinfo + key + "," + _key + ",'" + qishu + "','" + _dirname +"//"+ "face_" + m.Groups[1].Value + "')";
                SaveSqls(sql);

                MatchCollection mc2 = Regex.Matches(responseFromServer, @"href='(/d+.html?)'[/s/S]*?<font/s+color=black>(.*?)</a>|& lt;font[^>]*?>[(.+?)]", RegexOptions.IgnoreCase);
                foreach (Match m2 in mc2)
                {
                    newurl = m2.Groups[1].Value;

                    string muName = m2.Groups[3].Value;
                    if (muName == "")
                    {
                        muName = mulu;
                    }
                    string lstr = m2.Groups[2].Value;
                    string s1 = "";
                    string s2 = "";
                    if (lstr != "")
                    {
                        if (lstr.Contains("."))
                        {
                            s1 = lstr.Substring(0, lstr.IndexOf("."));
                            s2 = lstr.Substring(lstr.LastIndexOf(".") + 1);
                        }
                        else
                        {
                            s1 = lstr;
                            s2 = "";
                        }
                        int k2 = ++keyn;
                        sql = insertwz + k2 + "," + key + ",'" + muName + "','" + s1 + "','" + s2 + "')";
                        SaveSqls(sql);

                        cururl = urlpri.Substring(0, urlpri.LastIndexOf("/") + 1) + newurl;
                        curdir = _dirname;
                        four(cururl, curdir,k2);

                    }
                    mulu = muName;
                }              
            }
        }      
        public void four(string urlpri,string _dirname,int _key)
        {
            filename = _dirname + "//" + urlpri.Substring(urlpri.LastIndexOf("/") + 1);

            if (!Directory.Exists(_dirname))
            {
                Directory.CreateDirectory(_dirname);
            }
            if (File.Exists(filename))
            {
                responseFromServer = FileToText(filename);
            }
            else
            {
                responseFromServer = ToServer(urlpri);
            }
            sum++;
            if (responseFromServer != "")
            {
                TextToFile(filename, responseFromServer);

                //分析内容
                Match m = Regex.Match(responseFromServer, @"正文开始-->(?<text>[/s/S]*?)<!--正文结束", RegexOptions.IgnoreCase);
                string content = m.Groups["text"].Value; //得到正文的所有内容
                string c = FilterScript(content);
                c = Remove(c);  //得到过滤后的正文内容
               // Match ms = Regex.Match(c, @"正文开始-->(?<text>[/s/S]*?)<!--正文结束", RegexOptions.IgnoreCase);
              
              
                //设置要保存的XML 文件的名称
                string xmlname = urlpri.Substring(urlpri.LastIndexOf("/") + 1, urlpri.LastIndexOf(".") - urlpri.LastIndexOf("/"));
                string pathxml = _dirname + "//" + xmlname + "xml";  //将路径 和名字一起传过去

                Class1 cs = new Class1(_key, c, pathxml);
                cls.Add(cs);
                //序列化成功
                MatchCollection mc = Regex.Matches(responseFromServer, @"(<img/s+src=""(?<imgs>.*)""/s+hspace|HreF=""([^>]*PDF)"")", RegexOptions.IgnoreCase);
                foreach (Match m2 in mc)
                {
                    string imgurl = m2.Groups["imgs"].Value.Trim(); //得到单个图片的名称
                    string zhuurl = urlpri.Substring(0, urlpri.LastIndexOf("/") + 1);
                    if (imgurl != "")
                    {
                        string jurl = zhuurl + imgurl; //得到图片的绝对路径
                        DownFile(jurl, _dirname);
       
                    }
                    string pdfurl = m2.Groups["pdfs"].Value.Trim(); //得到单个PDF 的名称
                    if (pdfurl != "")
                    {
                        string jurl = zhuurl + pdfurl; //得到 pdf 的绝对路径  
                        DownFile(jurl, _dirname);
                                         
                    }
                }
            }
        }  
        private void btnOK_Click(object sender, EventArgs e)
        {
            al = ReadIPproxy("e://test.txt");//初始化代理   IP
            HtmlSource("http://www.zydg.net/magazine/");            
        }

        /// <summary>
        /// Click handler for <c>button1</c>: exits the application.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            Application.Exit();           
        }
      
    }
}

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值