using System;
using System.Collections.Generic;
using System.Text;
using lintTools;
using System.Web;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Collections;
namespace ty
{
class Program
{
/// <summary>
/// Entry point. Prints the usage banner, parses the <c>-s/-u/-p/-d/-f</c>
/// switches and then downloads either a single article (<c>-s</c>) or every
/// article linked from up to <c>downPage</c> consecutive list pages.
/// </summary>
/// <param name="args">Command-line switches of the form <c>-x:value</c>.</param>
static void Main(string[] args)
{
    SortedList[] arrTitle;
    string content = "", writer = "", title = "", nextUrl = "";

    LintSys.WriteLine("********************天涯文章下载大师v0.1 Copyright By lintg.200801***********", ConsoleColor.Yellow);
    LintSys.WriteLine("-s:单篇文章地址,默认取文章名称为下载文件名称,默认下载多页", ConsoleColor.Yellow);
    LintSys.WriteLine("-u:下载页面的首页,默认取天涯舞文弄墨首页", ConsoleColor.Yellow);
    LintSys.WriteLine("-p:下载页数,默认取100页", ConsoleColor.Yellow);
    LintSys.WriteLine("-d:下载目录,默认取当前目录", ConsoleColor.Yellow);
    LintSys.WriteLine("-f:过滤文件大小,默认过滤4000字节以下文件", ConsoleColor.Yellow);

    // Defaults used when the corresponding switch is absent or invalid.
    string downMode = "m", url = "http://cache.tianya.cn/pub/list/0/culture.shtml", downDir = "./", sUrl = "";
    int downPage = 100, filterBytes = 4000;

    // One switch per argument: "-<letter>:<value>", value = the remaining non-whitespace text.
    Regex regex = new Regex(@"-(?<paramName>[supdf]):(?<paramValue>\S*?)$", RegexOptions.Singleline);
    for (int i = 0; i < args.Length; i++)
    {
        Match m = regex.Match(args[i]);
        if (!m.Success)
        {
            continue;
        }

        string paramValue = m.Groups["paramValue"].Value;
        int parsed;
        switch (m.Groups["paramName"].Value)
        {
            case "s":
                downMode = "s";
                sUrl = paramValue;
                Console.WriteLine(sUrl);
                break;
            case "u":
                url = paramValue;
                break;
            case "p":
                // Keep the default when the value is not a valid integer.
                if (int.TryParse(paramValue, out parsed))
                {
                    downPage = parsed;
                }
                else
                {
                    Console.WriteLine("参数错误,-p:页数");
                }
                break;
            case "d":
                downDir = paramValue;
                break;
            case "f":
                if (int.TryParse(paramValue, out parsed))
                {
                    filterBytes = parsed;
                }
                else
                {
                    Console.WriteLine("参数错误,-f:过滤文件大小");
                }
                break;
        }
    }

    switch (downMode)
    {
        case "s":
            // Single-article mode: the file is named after the page title.
            GetAuthor(sUrl, ref writer, ref title);
            content = GetArticle(sUrl, writer);
            LintSys.WriteFile(BuildOutputPath(downDir, title), content);
            break;
        default:
            for (int j = 0; j < downPage; j++)
            {
                LintSys.WriteLine("第" + (j + 1).ToString() + "页:" + url, ConsoleColor.Red);

                // Reset before every call so a stale link from the previous
                // page can never be mistaken for this page's "next" link.
                nextUrl = null;
                arrTitle = GetTitle(url, ref nextUrl);

                // GetTitle leaves index 0 as an empty placeholder and does not
                // populate the last slot, so only 1..Length-2 are real entries.
                for (int i = 1; i < arrTitle.Length - 1; i++)
                {
                    LintSys.WriteLine((j + 1).ToString() + "-" + i.ToString() + ",下载文章:" + arrTitle[i]["title"].ToString() + ",作者:" + arrTitle[i]["writer"], ConsoleColor.Green);
                    content = GetArticle(arrTitle[i]["url"].ToString(), arrTitle[i]["writer"].ToString());

                    // NOTE: the threshold is compared against the character count, not a byte count.
                    if (content.Length > filterBytes)
                    {
                        LintSys.WriteFile(BuildOutputPath(downDir, arrTitle[i]["title"].ToString()), content, FileMode.Create);
                    }
                    else
                    {
                        Console.WriteLine("文件:" + content.Length.ToString() + "<" + filterBytes.ToString() + ",被过滤");
                    }
                }

                // Stop when the list page has no usable "next page" link.
                if (string.IsNullOrEmpty(nextUrl))
                {
                    break;
                }
                url = nextUrl;
            }
            break;
    }
}

/// <summary>
/// Builds the path of the <c>.txt</c> file for an article: characters that are
/// not allowed in file names are dropped from the title and the result is
/// combined with the download directory.
/// </summary>
/// <param name="directory">Target directory (may be empty for the current directory).</param>
/// <param name="articleTitle">Raw article title.</param>
/// <returns>The combined output path ending in <c>.txt</c>.</returns>
static string BuildOutputPath(string directory, string articleTitle)
{
    char[] invalidChars = Path.GetInvalidFileNameChars();
    StringBuilder safeName = new StringBuilder(articleTitle.Length);
    foreach (char c in articleTitle)
    {
        if (Array.IndexOf(invalidChars, c) < 0)
        {
            safeName.Append(c);
        }
    }
    return Path.Combine(directory, safeName.ToString() + ".txt");
}
/// <summary>
/// Downloads an article page and extracts its title (text of the
/// <c>&lt;TITLE&gt;</c> element) and its author (text of the first link that
/// follows the "作者:" label).
/// </summary>
/// <param name="url">Address of the article page.</param>
/// <param name="writer">Receives the author name; left unchanged when not found.</param>
/// <param name="title">Receives the page title; left unchanged when not found.</param>
/// <returns>
/// <c>true</c> when both the title and the author were found; otherwise <c>false</c>.
/// </returns>
static bool GetAuthor(string url, ref string writer, ref string title)
{
    CookieContainer cc = new CookieContainer();
    string content = Net.GetContent(url, ref cc);
    if (string.IsNullOrEmpty(content))
    {
        // Nothing was downloaded, so there is nothing to match against.
        return false;
    }

    Match titleMatch = Regex.Match(content, "<TITLE>(?<title>.*?)</TITLE>", RegexOptions.Singleline);
    if (titleMatch.Success)
    {
        title = titleMatch.Groups["title"].Value;
    }

    Match writerMatch = Regex.Match(content, @"作者:<a .*?>(?<writer>.*?)</a>", RegexOptions.Singleline);
    if (writerMatch.Success)
    {
        writer = writerMatch.Groups["writer"].Value;
    }

    return titleMatch.Success && writerMatch.Success;
}
/// <summary>
/// Downloads an article thread page by page (following the "下一页" link) and
/// collects the posts written by <paramref name="writer"/>.
/// </summary>
/// <param name="url">Address of the first page of the thread.</param>
/// <param name="writer">Author name; only posts that follow this name are collected.</param>
/// <returns>
/// The collected posts, each prefixed with a running "(n)" counter, after being
/// passed through <c>Trans.ReplaceHtml</c>.
/// </returns>
static string GetArticle(string url, string writer)
{
    StringBuilder collected = new StringBuilder();
    int postNumber = 0;
    CookieContainer cc = new CookieContainer();

    // The author name is inserted literally: escape it so characters such as
    // '*', '(' or '+' in a nickname cannot break or alter the pattern.
    Regex postRegex = new Regex(Regex.Escape(writer) + "</a>.*?</table>(?<content>.*?)(<TABLE)", RegexOptions.Singleline);
    Regex nextPageRegex = new Regex(@"<a \S*? href=(?<url>\S*?)>下一页</a>", RegexOptions.Singleline);

    while (true)
    {
        Console.WriteLine("连接" + url + ".....");
        string content = Net.GetContent(url, ref cc);

        MatchCollection posts = postRegex.Matches(content);
        LintSys.WriteLine("匹配回帖:" + posts.Count.ToString(), ConsoleColor.Yellow);

        for (int i = 0; i < posts.Count; i++)
        {
            string replyContent = posts[i].Groups["content"].Value.Trim();

            // A post counts as real content when it is longer than 50 characters,
            // or longer than 100 characters if it begins with "作者".
            bool startsWithAuthorLabel = replyContent.StartsWith("作者", StringComparison.Ordinal);
            int minimumLength = startsWithAuthorLabel ? 100 : 50;
            if (replyContent.Length > minimumLength)
            {
                collected.Append("(").Append((postNumber++).ToString()).Append(")\n").Append(replyContent);
            }
        }

        Match next = nextPageRegex.Match(content);
        if (!next.Success)
        {
            break;
        }
        url = next.Groups["url"].Value;
    }

    LintSys.WriteLine("下载完成....", ConsoleColor.DarkGreen);
    return Trans.ReplaceHtml(collected.ToString());
}
/// <summary>
/// Downloads a list page, extracts the article links it contains and looks up
/// the link to the next list page.
/// </summary>
/// <param name="url">Address of the list page.</param>
/// <param name="nextUrl">
/// Receives the target of the "下一页" link, or <c>null</c> when the page has none.
/// </param>
/// <returns>
/// An array with one slot per matched link. Index 0 is always an empty
/// placeholder and the last slot is left <c>null</c>; the slots in between
/// carry the keys "url", "title" and "writer". The array is empty when nothing
/// matched.
/// </returns>
static SortedList[] GetTitle(string url, ref string nextUrl)
{
    CookieContainer cc = new CookieContainer();
    string content = Net.GetContent(url, ref cc);

    // The raw page is dumped to log.txt on every call.
    LintSys.WriteFile("log.txt", content);

    Regex linkRegex = new Regex(@"<a href='(?<url>http://cache.tianya.cn/publicforum/content\S*)'.*?>(?<title>.*?)<.*?vwriter=(?<writer>.*?)'", RegexOptions.Singleline);
    MatchCollection links = linkRegex.Matches(content);

    SortedList[] title = new SortedList[links.Count];
    if (links.Count > 0)
    {
        // Callers start reading at index 1; index 0 only has to be non-null.
        title[0] = new SortedList();
    }

    // NOTE(review): the first and the last match are skipped on purpose here
    // because the caller iterates 1..Length-2 - confirm this is intended.
    Regex titleCleaner = new Regex(Reg.dirStr);
    for (int i = 1; i < links.Count - 1; i++)
    {
        SortedList entry = new SortedList();
        entry["url"] = links[i].Groups["url"].Value;
        entry["title"] = titleCleaner.Replace(links[i].Groups["title"].Value, "");
        entry["writer"] = links[i].Groups["writer"].Value;
        title[i] = entry;
    }

    // Report "no next page" explicitly instead of leaving a stale value behind.
    Match next = Regex.Match(content, @"<a href=(?<url>\S*)?>下一页</a>", RegexOptions.Singleline);
    nextUrl = next.Success ? next.Groups["url"].Value : null;

    return title;
}
}
}
// http://info95.vicp.net/info95/non-cgi/usr/5/5_6.rar