天涯文章采集器

using System;

using System.Collections.Generic;

using System.Text;

using lintTools;

using System.Web;

using System.Net;

using System.IO;

using System.Text.RegularExpressions;

using System.Collections;



namespace ty

{

    class Program

    {

        /// <summary>
        /// Program entry point. Prints the usage banner, parses the command-line
        /// switches, then either downloads one article (<c>-s</c>) or walks the list
        /// pages starting at the board URL and downloads every article listed there.
        /// </summary>
        /// <param name="args">
        /// Switches of the form <c>-x:value</c> where x is one of:
        /// s (single article URL), u (first list page URL), p (number of list pages),
        /// d (output directory), f (minimum content length; shorter articles are skipped).
        /// Arguments that do not match this shape are ignored.
        /// </param>
        static void Main(string[] args)
        {
            SortedList[] arrTitle;
            string content = "", writer = "", title = "", nextUrl;

            LintSys.WriteLine("********************天涯文章下载大师v0.1 Copyright By lintg.200801***********", ConsoleColor.Yellow);
            LintSys.WriteLine("-s:单篇文章地址,默认取文章名称为下载文件名称,默认下载多页", ConsoleColor.Yellow);
            LintSys.WriteLine("-u:下载页面的首页,默认取天涯舞文弄墨首页", ConsoleColor.Yellow);
            LintSys.WriteLine("-p:下载页数,默认取100页", ConsoleColor.Yellow);
            LintSys.WriteLine("-d:下载目录,默认取当前目录", ConsoleColor.Yellow);
            LintSys.WriteLine("-f:过滤文件大小,默认过滤4000字节以下文件", ConsoleColor.Yellow);

            string downMode = "m", url = "http://cache.tianya.cn/pub/list/0/culture.shtml", downDir = "./", sUrl = "";
            int downPage = 100, filterBytes = 4000;

            // "-<switch>:<value>", where the value is everything up to the end of the argument.
            Regex regex = new Regex(@"-(?<paramName>[supdf]):(?<paramValue>\S*?)$", RegexOptions.Singleline);
            for (int i = 0; i < args.Length; i++)
            {
                Match m = regex.Match(args[i]);
                if (!m.Success)
                {
                    continue;
                }

                string paramValue = m.Groups["paramValue"].Value;
                int parsed;
                switch (m.Groups["paramName"].Value)
                {
                    case "s":
                        downMode = "s";
                        sUrl = paramValue;
                        Console.WriteLine(sUrl);
                        break;
                    case "u":
                        url = paramValue;
                        break;
                    case "p":
                        // Keep the default when the value is not a number.
                        if (int.TryParse(paramValue, out parsed))
                        {
                            downPage = parsed;
                        }
                        else
                        {
                            Console.WriteLine("参数错误,-p:页数");
                        }
                        break;
                    case "d":
                        downDir = paramValue;
                        break;
                    case "f":
                        // Keep the default when the value is not a number.
                        if (int.TryParse(paramValue, out parsed))
                        {
                            filterBytes = parsed;
                        }
                        else
                        {
                            Console.WriteLine("参数错误,-f:过滤文件大小");
                        }
                        break;
                }
            }

            if (downMode == "s")
            {
                // Single-article mode: the file is named after the page title.
                GetAuthor(sUrl, ref writer, ref title);
                content = GetArticle(sUrl, writer);
                LintSys.WriteFile(BuildOutputPath(downDir, title), content);
                return;
            }

            // List mode: one iteration per list page, following the "next page" link.
            for (int j = 0; j < downPage; j++)
            {
                LintSys.WriteLine("第" + (j + 1).ToString() + "页:" + url, ConsoleColor.Red);

                // GetTitle only assigns nextUrl when it finds a next-page link, so clear it
                // first; otherwise a stale value would make the loop re-read the same page.
                nextUrl = null;
                arrTitle = GetTitle(url, ref nextUrl);

                // GetTitle leaves slot 0 as an empty placeholder and does not populate the
                // last slot, so only the entries in between are usable.
                for (int i = 1; i < arrTitle.Length - 1; i++)
                {
                    string articleTitle = arrTitle[i]["title"].ToString();
                    LintSys.WriteLine((j + 1).ToString() + "-" + i.ToString() + ",下载文章:" + articleTitle + ",作者:" + arrTitle[i]["writer"], ConsoleColor.Green);

                    content = GetArticle(arrTitle[i]["url"].ToString(), arrTitle[i]["writer"].ToString());
                    if (content.Length > filterBytes)
                    {
                        // Honour -d here as well, the same way single-article mode does.
                        LintSys.WriteFile(BuildOutputPath(downDir, articleTitle), content, FileMode.Create);
                    }
                    else
                    {
                        Console.WriteLine("文件:" + content.Length.ToString() + "<" + filterBytes.ToString() + ",被过滤");
                    }
                }

                if (string.IsNullOrEmpty(nextUrl))
                {
                    break;
                }
                url = nextUrl;
            }
        }

        /// <summary>
        /// Builds the path "<paramref name="directory"/>/<paramref name="fileTitle"/>.txt",
        /// dropping characters that are not allowed in a file name on the current platform.
        /// </summary>
        private static string BuildOutputPath(string directory, string fileTitle)
        {
            char[] invalid = Path.GetInvalidFileNameChars();
            StringBuilder safeName = new StringBuilder(fileTitle.Length);
            foreach (char c in fileTitle)
            {
                if (Array.IndexOf(invalid, c) < 0)
                {
                    safeName.Append(c);
                }
            }
            return Path.Combine(directory, safeName.ToString() + ".txt");
        }

        /// <summary>
        /// Fetches the page at <paramref name="url"/> via Net.GetContent and extracts the
        /// page title and the author name from the returned text.
        /// </summary>
        /// <param name="url">Address of the article page.</param>
        /// <param name="writer">Receives the author name; left unchanged when no author link is found.</param>
        /// <param name="title">Receives the text between the TITLE tags; left unchanged when none is found.</param>
        /// <returns>Always <c>true</c>, whether or not anything was extracted.</returns>
        static bool GetAuthor(string url, ref string writer, ref string title)
        {
            CookieContainer cookies = new CookieContainer();
            string html = Net.GetContent(url, ref cookies);

            // Page title: whatever sits between the (upper-case) TITLE tags.
            Match titleMatch = Regex.Match(html, "<TITLE>(?<title>.*?)</TITLE>", RegexOptions.Singleline);
            if (titleMatch.Success)
            {
                title = titleMatch.Groups["title"].Value;
            }

            // Author: text of the first link that follows the "作者:" label.
            Match writerMatch = Regex.Match(html, @"作者:<a .*?>(?<writer>.*?)</a>", RegexOptions.Singleline);
            if (writerMatch.Success)
            {
                writer = writerMatch.Groups["writer"].Value;
            }

            return true;
        }



        /// <summary>
        /// Downloads an article page by page, following the "下一页" (next page) link, and
        /// collects the posts that follow <paramref name="writer"/>'s name in the markup.
        /// Each kept post is prefixed with a running "(n)" counter starting at 0.
        /// </summary>
        /// <param name="url">Address of the article's first page.</param>
        /// <param name="writer">Author name used to pick out that author's posts.</param>
        /// <returns>
        /// The concatenated posts after being passed through Trans.ReplaceHtml
        /// (project helper, not visible here).
        /// </returns>
        static string GetArticle(string url, string writer)
        {
            StringBuilder collected = new StringBuilder();
            int postNumber = 0;
            CookieContainer cc = new CookieContainer();

            // The author name is arbitrary page text, so escape it before using it as a
            // pattern; names containing regex metacharacters would otherwise break the match.
            Regex postRegex = new Regex(Regex.Escape(writer) + "</a>.*?</table>(?<content>.*?)(<TABLE)", RegexOptions.Singleline);
            Regex nextPageRegex = new Regex(@"<a \S*? href=(?<url>\S*?)>下一页</a>", RegexOptions.Singleline);

            while (true)
            {
                Console.WriteLine("连接" + url + ".....");
                string content = Net.GetContent(url, ref cc);

                MatchCollection posts = postRegex.Matches(content);
                LintSys.WriteLine("匹配回帖:" + posts.Count.ToString(), ConsoleColor.Yellow);

                for (int i = 0; i < posts.Count; i++)
                {
                    string replyContent = posts[i].Groups["content"].Value.Trim();

                    // A post counts as real content when it is longer than 50 characters,
                    // or longer than 100 when it begins with "作者".
                    bool startsWithAuthorLabel = replyContent.StartsWith("作者", StringComparison.Ordinal);
                    int minimumLength = startsWithAuthorLabel ? 100 : 50;
                    if (replyContent.Length > minimumLength)
                    {
                        collected.Append("(").Append(postNumber.ToString()).Append(")\n").Append(replyContent);
                        postNumber++;
                    }
                }

                Match next = nextPageRegex.Match(content);
                if (!next.Success)
                {
                    break;
                }
                url = next.Groups["url"].Value;
            }

            LintSys.WriteLine("下载完成....", ConsoleColor.DarkGreen);
            return Trans.ReplaceHtml(collected.ToString());
        }

        



        /// <summary>
        /// Fetches a list page and extracts the article links found on it, together with
        /// the address of the following list page.
        /// </summary>
        /// <param name="url">Address of the list page.</param>
        /// <param name="nextUrl">
        /// Set to the target of the "下一页" (next page) link when one is found;
        /// left unchanged otherwise.
        /// </param>
        /// <returns>
        /// An array with one slot per link matched. Slot 0 is always an empty
        /// <see cref="SortedList"/> and the last slot is left <c>null</c>; the slots in
        /// between carry the keys "url", "title" and "writer". The array is empty when
        /// nothing matched.
        /// </returns>
        static SortedList[] GetTitle(string url, ref string nextUrl)
        {
            CookieContainer cc = new CookieContainer();
            string content = Net.GetContent(url, ref cc);

            // The raw page text is written to log.txt on every call.
            LintSys.WriteFile("log.txt", content);

            Regex linkRegex = new Regex(@"<a href='(?<url>http://cache.tianya.cn/publicforum/content\S*)'.*?>(?<title>.*?)<.*?vwriter=(?<writer>.*?)'", RegexOptions.Singleline);
            MatchCollection mc = linkRegex.Matches(content);

            SortedList[] title = new SortedList[mc.Count];

            // Reg.dirStr is a project-defined pattern; its matches are removed from the
            // title (presumably characters unsuitable for a file name - confirm in lintTools).
            Regex titleCleaner = new Regex(Reg.dirStr);

            // NOTE(review): the first match is discarded (slot 0 becomes a placeholder) and
            // the last match is never stored. Main iterates 1..Length-2 to match this layout,
            // so it is preserved here; confirm whether dropping those two links is intended.
            for (int i = 1; i < mc.Count - 1; i++)
            {
                title[i] = new SortedList();
                title[i]["url"] = mc[i].Groups["url"].Value;
                title[i]["title"] = titleCleaner.Replace(mc[i].Groups["title"].Value, "");
                title[i]["writer"] = mc[i].Groups["writer"].Value;
            }

            // Guarded: with no matches the array is empty and indexing slot 0 would throw.
            if (title.Length > 0)
            {
                title[0] = new SortedList();
            }

            Match next = new Regex(@"<a href=(?<url>\S*)?>下一页</a>", RegexOptions.Singleline).Match(content);
            if (next.Success)
            {
                nextUrl = next.Groups["url"].Value;
            }

            return title;
        }

    }

}

http://info95.vicp.net/info95/non-cgi/usr/5/5_6.rar
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值