C# 版网站采集


namespace CJ
{
    public partial class Form1 : Form
    {
        // Buffer whose text Executesql() passes to SqlHelper as the SQL command.
        // NOTE(review): nothing in the visible part of this partial class appends to it.
        StringBuilder sbs = new StringBuilder();
        // Records queued by four() and written to XML files by SaveXmls().
        List<Class1> cls = new List<Class1>();
        // Proxy addresses added line by line by ReadIPproxy(); DownFile() and ToServer()
        // select the current one with al[proxy].
        public ArrayList al = new ArrayList();

        /// <summary>
        /// 保存网页
        /// </summary>
        /// <param name="FILE_NAME">文件的路径</param>
        /// <param name="data">数据</param>
        public void TextToFile(string FILE_NAME, string data)
        {
            if (File.Exists(FILE_NAME))
            {
                return;
            }
            using (StreamWriter sw = File.CreateText(FILE_NAME))
            {
                sw.Write(data);
                sw.Close();
            }
        }
        /// <summary>
        /// 下载文件
        /// </summary>
        /// <param name="PageUrl">网址</param>
        /// <param name="filename">保存文件路径</param>
        public void DownFile(string PageUrl, string filename)
        {
            if (!Directory.Exists(filename))
            {
                Directory.CreateDirectory(filename);
            }
            string path = PageUrl.Substring(PageUrl.LastIndexOf("/") + 1);
            string dirname = filename + "\\" + path;


            if (File.Exists(dirname))
            {
                return;
            }
            else
            {
                try
                {                   
                    WebClient wc = new WebClient();
                    WebProxy wp = new WebProxy(al[proxy].ToString(), true);
                    wc.Proxy = wp;
                    wc.DownloadFile(PageUrl, dirname);
                }
                catch (WebException ex)
                {                  
                    if (ex.Status == WebExceptionStatus.ProtocolError)
                    {
                        //文件未找到--跳出
                        //MessageBox.Show(ex.ToString());
                        return;
                    }
                    else
                    {
                        //换代理 IP
                        //MessageBox.Show(ex.ToString());
                        proxy++;
                        if (proxy >= al.Count)
                        {
                            al = ReadIPproxy("e:\\test.txt");//初始化代理   IP
                        }
                        DownFile(PageUrl, filename);
                    }
                }
            }

        }
        /// <summary>
        /// 读文件
        /// </summary>
        /// <param name="FILE_NAME">文件的路径</param>
        /// <returns>数据</returns>
        public ArrayList ReadIPproxy(string FILE_NAME)
        {   
            using (StreamReader sr = File.OpenText(FILE_NAME))
            {
                String input;
                while ((input = sr.ReadLine()) != null)
                {
                    al.Add(input);
                }         
                sr.Close();
            }
            return al;

        }
        /// <summary>
        /// 数据库
        /// </summary>
        public void Executesql()
        {
            SqlHelper.ExecuteNonQuery(SqlHelper.sqlstr, CommandType.Text, sbs.ToString(), null);
        }
        /// <summary>
        /// 读文件
        /// </summary>
        /// <param name="FILE_NAME">文件的路径</param>
        /// <returns>数据</returns>
        public string FileToText(string FILE_NAME)
        {
            string data;
            using (StreamReader sr = File.OpenText(FILE_NAME))
            {
                data=sr.ReadToEnd();
                sr.Close();
            }
            return data;
        }    
        /// <summary>
        /// 请求失败的时候,反复操作
        /// </summary>
        /// <param name="PageUrl"></param>
        /// <returns></returns>
        public string ToServer(string PageUrl)
        {
            string responseFromServer = "";
         
            try
            {               
                while (1 == 1)
                {
                    WebRequest request = WebRequest.Create(PageUrl);
                    WebProxy wp = new WebProxy(al[proxy].ToString(), true);
                    request.Proxy = wp;
                    request.Timeout = 1000 * 45;

                    HttpWebResponse response = (HttpWebResponse)request.GetResponse();
                    Stream dataStream = response.GetResponseStream();
                    StreamReader reader=null;                
                    reader = new StreamReader(dataStream, System.Text.Encoding.Default);
                    responseFromServer = reader.ReadToEnd();                   
                    reader.Close();
                    dataStream.Close();
                    response.Close();
                    if (responseFromServer.Contains("refresh") || responseFromServer == "")
                    {
                        proxy++;
                        if (proxy >= al.Count)
                        {
                            al = ReadIPproxy("e:\\test.txt");//初始化代理   IP
                        }                
                    }
                    else
                    {
                        break;
                    }
                }
            }
            catch (WebException ex)
            {               
                if (ex.Status == WebExceptionStatus.ProtocolError)
                {                  
                    responseFromServer = "";
                }
                else
                {
                    proxy++;
                    if (proxy >= al.Count)
                    {
                        al = ReadIPproxy("e:\\test.txt");//初始化代理   IP
                    }
                    ToServer(PageUrl);
                }
            }          
            return responseFromServer;
        }
        /// <summary>
        /// 保存XML 文件
        /// </summary>
        public void SaveXmls()
        {
            string pathxml = "";
            foreach (Class1 c in cls)
            {
                Class1 s = c;
                pathxml = s.address;

                if (!File.Exists(pathxml))
                {
                    XmlSerializer xs = new XmlSerializer(typeof(Class1));
                    Stream stream = new FileStream(pathxml, FileMode.Create, FileAccess.Write, FileShare.ReadWrite);
                    xs.Serialize(stream, s);
                    stream.Close();
                }            
              
            }
        }
        /// <summary>
        /// 移除HTMl 标记
        /// </summary>
        /// <param name="Html"></param>
        /// <param name="RegStr"></param>
        /// <returns></returns>
        public static string Remove(string Html)
        {
            string regesstr = "<.*?>";
            return Regex.Replace(Html, regesstr, string.Empty, RegexOptions.IgnoreCase);
        }
        public static string FilterScript(string content)
        {
            string regexstr = @"<(script)[^>]*>(\s*|.)*</\1>";
            return Regex.Replace(content,regexstr,string.Empty,RegexOptions.IgnoreCase);

        }
        public void HtmlSource(string urlpri)
        {
            //要写入的文件路径
            filename = ".........\\magazine.html";

            if (!Directory.Exists("E:\\...."))
            {
                Directory.CreateDirectory("E:............");
            }
            if (File.Exists(filename))
            {
                responseFromServer=FileToText(filename); //存在               
            }
            else
            {
                responseFromServer = ToServer(urlpri); //不存在               
            }
            sum++;
            if (responseFromServer != "")
            {
                //分析内容
                TextToFile(filename,responseFromServer);

                MatchCollection mc = Regex.Matches(responseFromServer, @"href=""/magazine/(.*)""><b>(.*)</b>", RegexOptions.IgnoreCase);
                foreach (Match m in mc)
                {
                    newurl = m.Groups[1].Value;
                    dirname = m.Groups[2].Value;
                    int key = ++keyi;                
                    cururl = urlpri + newurl;
                    curdir = "E:\\........\\" + dirname;
                    one(cururl, curdir,key);
                }
                SaveXmls();            
            }
        }
        public void one(string urlpri,string _dirname,int _key)
        {
            //要写入的文件路径
            filename = _dirname +"\\"+ urlpri.Substring(urlpri.LastIndexOf("/") + 1);

            if (!Directory.Exists(_dirname))
            {
                Directory.CreateDirectory(_dirname);
            }
            if (File.Exists(filename))
            {
                responseFromServer = FileToText(filename);
            }
            else
            {
                responseFromServer = ToServer(urlpri);
            }
            sum++;
            if (responseFromServer != "")
            {
                TextToFile(filename, responseFromServer);
                MatchCollection mc = Regex.Matches(responseFromServer, @"href=""\.\./(.*list.html)""[\s\S]*?《(.*?)》", RegexOptions.IgnoreCase);
                foreach (Match m in mc)
                {
                    newurl = m.Groups[1].Value;
                    dirname = m.Groups[2].Value;
                    cururl = "http://www......net/............./" + newurl;
                    curdir = _dirname + "\\" + dirname;
                    two(cururl, curdir, _key);
                }                              
            }          
        }       
        public void four(string urlpri,string _dirname,int _key)
        {
            filename = _dirname + "\\" + urlpri.Substring(urlpri.LastIndexOf("/") + 1);
            if (!Directory.Exists(_dirname))
            {
                Directory.CreateDirectory(_dirname);
            }
            if (File.Exists(filename))
            {
                responseFromServer = FileToText(filename);
            }
            else
            {
                responseFromServer = ToServer(urlpri);
            }
            sum++;
            if (responseFromServer != "")
            {
                TextToFile(filename, responseFromServer);
                //分析内容
                Match m = Regex.Match(responseFromServer, @"正文开始-->(?<text>[\s\S]*?)<!--正文结束", RegexOptions.IgnoreCase);
                string content = m.Groups["text"].Value; //得到正文的所有内容
                string c = FilterScript(content);
                c = Remove(c); //得到过滤后的正文内容
               // Match ms = Regex.Match(c, @"正文开始-->(?<text>[\s\S]*?)<!--正文结束", RegexOptions.IgnoreCase);              
              
                //设置要保存的XML 文件的名称
                string xmlname = urlpri.Substring(urlpri.LastIndexOf("/") + 1, urlpri.LastIndexOf(".") - urlpri.LastIndexOf("/"));
                string pathxml = _dirname + "\\" + xmlname + "xml"; //将路径 和名字一起传过去
                Class1 cs = new Class1(_key, c, pathxml);
                cls.Add(cs);
                //序列化成功
                MatchCollection mc = Regex.Matches(responseFromServer, @"(<img\s+src=""(?<imgs>.*?)""\s+hspace.+?>|HreF=""([^>]*PDF)"")", RegexOptions.IgnoreCase);
                foreach (Match m2 in mc)
                {
                    string imgurl = m2.Groups["imgs"].Value.Trim(); //得到单个图片的名称
                    string zhuurl = urlpri.Substring(0, urlpri.LastIndexOf("/") + 1);
                    if (imgurl != "")
                    {
                        string jurl = zhuurl + imgurl; //得到图片的绝对路径
                        DownFile(jurl, _dirname);       
                    }
                    string pdfurl = m2.Groups[2].Value.Trim(); //得到单个PDF 的名称
                    if (pdfurl != "")
                    {
                        string jurl = zhuurl + pdfurl; //得到 pdf 的绝对路径  
                        DownFile(jurl, _dirname);
                                         
                    }
                }
            }
        }  
        private void btnOK_Click(object sender, EventArgs e)
        {
            al = ReadIPproxy("e:\\test.txt");//初始化代理   IP
            HtmlSource("http://www.....net/........../");  
        }                   
    }
}

转载于:https://www.cnblogs.com/qinhaijun/archive/2011/08/26/2154434.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值