深感找吃的地方不方便,于是萌生了把水木food版的文章搬到手机上去的想法。
不过E60找不到软件支持类似"桌面搜索"的文档内容搜索功能,这样面对上千篇从food版批量down下来的htm文章,找起来就相当吃力了。翻来弄去发现手机本身的自带搜索功能可以搜短信和邮件,而且内容搜索也符合我的要求。
于是今天就弄了一天,怎么把那些food版的批量下载后的htm文件,变成存在我手机上的电子邮件,以便出门在外也可以搜索。
整个过程还算顺利:
第一步,先架一个smtp和pop3服务器。
Windows 2003可以架,于是一边开我的2003虚拟机,一边在网上搜有没有更快的方法。
找到一个Foxmail Server for Windows 公开测试版,看起来直接就能用,就直接把我的虚拟机关掉了,太卡了。
然后按提示装下来,拿outlook测一下,好用。赞~
第二步,把一个个文件用smtp发出为一封封信。开始msdn找啊找,找到System.Net.Mail,搞定。
第三步,html格式不爽,想把那些tag都去掉,直接就能用手机看了。继续msdn,找到一个System.Web.RegularExpressions下的TextRegex,完全不是那么回事。再找,找到System.Web.HttpUtility.HtmlDecode(string),还有那么点用,但tag还是删不完。最后google之,找到了篇文章,rob代码过来用了。
第四步,跑一下工程,发了一千多封信,outlook收一下正常。不过在手机上收就不那么顺利了,数字就不支持四位数的,而且一下子收太多的信会超时。只好100封100封地发,再用手机一点一点地收。
总算弄完了,希望以后会有用。睡了。zzzzzZZZZZ
using System.Collections.Generic;
using System.Collections;
using System.Text.RegularExpressions;
using System.Text;
using System.IO;
using System.Web;
using System.Web.RegularExpressions;
using System.Xml;
using System.Net.Mail;
namespace GetHtmlTitleFromFolder
{
    /// <summary>
    /// Console tool that mails a batch of saved HTML articles as plain-text
    /// messages through a local SMTP server, one message per file. The subject
    /// is the file name (without extension) followed by the cleaned-up page
    /// title; the body is the page with its markup stripped.
    /// </summary>
    class Program
    {
        /// <summary>Number of files mailed per run.</summary>
        private const int BatchSize = 100;

        /// <summary>
        /// Placeholder that carries the source's own line feeds through the
        /// tag-stripping steps so they can be restored as line breaks afterwards.
        /// </summary>
        private const string NewlineMarker = "====n====";

        /// <summary>
        /// Captures the text between the title tags. Case-insensitive and
        /// non-greedy; '.' does not cross a line feed, so a title must sit on one line.
        /// </summary>
        private static readonly Regex TitleRegex =
            new Regex("<title>(.*?)</title>", RegexOptions.IgnoreCase);

        /// <summary>
        /// Entity-to-text substitutions applied after all tags are removed.
        /// "&amp;amp;" is deliberately absent: it is decoded last so that an
        /// escaped entity such as "&amp;amp;lt;" is not decoded twice.
        /// </summary>
        private static readonly string[,] EntityReplacements =
        {
            { "&nbsp;", " " },
            { "&bull;", " * " },
            { "&lsaquo;", "<" },
            { "&rsaquo;", ">" },
            { "&trade;", "(tm)" },
            { "&frasl;", "/" },
            { "&lt;", "<" },
            { "&gt;", ">" },
            { "&copy;", "(c)" },
            { "&reg;", "(r)" },
            { "&quot;", "\"" },
        };

        /// <summary>
        /// Mails one batch of files from the hard-coded folder. Edit
        /// <c>step</c> (in multiples of <see cref="BatchSize"/>) between runs
        /// to move on to the next batch.
        /// </summary>
        /// <param name="args">Unused.</param>
        static void Main(string[] args)
        {
            SmtpClient client = new SmtpClient("localhost", 8025);
            string folder = @"C:/Documents and Settings/Administrator/My Documents/Food";
            DirectoryInfo dir = new DirectoryInfo(folder);

            // GetFiles() does not guarantee any order; sort so that the batches
            // of consecutive runs neither overlap nor leave gaps.
            FileInfo[] files = dir.GetFiles();
            SortByName(files);

            // Zero-based, half-open batch [batchStart, batchEnd). The previous
            // bounds (301 + step .. < 400 + step) covered only 99 files and
            // never reached the indexes that are multiples of 100.
            int step = 700;
            int batchStart = 300 + step;
            int batchEnd = System.Math.Min(batchStart + BatchSize, files.Length);

            for (int i = batchStart; i < batchEnd; i++)
            {
                FileInfo file = files[i];

                // Read each file once; ReadAllText also closes the file handle.
                // NOTE(review): Encoding.Default is the machine's ANSI code page
                // on .NET Framework - confirm it matches the saved pages.
                string html = File.ReadAllText(file.FullName, Encoding.Default);

                string title = Path.GetFileNameWithoutExtension(file.Name) + ExtractTitle(html);
                string body = StripHTML(html);
                client.Send("m@old.smth.food", "leezile@leezile.vicp.net", title, body);
            }
        }

        /// <summary>
        /// Builds a table mapping each file name in <paramref name="dir"/> to
        /// the cleaned-up title of that HTML file.
        /// </summary>
        /// <param name="dir">Folder whose files are scanned (not recursive).</param>
        /// <returns>
        /// Hashtable keyed by file name (string) with the cleaned title (string)
        /// as value; the value is an empty string when a file has no title element.
        /// </returns>
        public static Hashtable GetFileTitle(DirectoryInfo dir)
        {
            Hashtable titles = new Hashtable();
            foreach (FileInfo file in dir.GetFiles())
            {
                string html = File.ReadAllText(file.FullName, Encoding.Default);
                titles[file.Name] = ExtractTitle(html);
            }
            return titles;
        }

        /// <summary>
        /// Converts an HTML document to plain text: drops the head, script and
        /// style elements, turns block-level tags into line breaks and table
        /// cells into tabs, removes every remaining tag, decodes common
        /// entities and collapses redundant blank lines and tabs.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the control-character and entity literals of this
        /// method were lost in the copy this code was recovered from (for
        /// example a Replace with an empty search string, which always throws).
        /// They are restored here from the step comments that survived. The
        /// visible newline-marker and colon rules are kept; a few replacements
        /// whose literals could not be recovered were dropped. Confirm the
        /// output against real articles before a large mailing.
        /// </remarks>
        /// <param name="source">HTML text; may be null or empty.</param>
        /// <returns>
        /// The plain text with CRLF line breaks and no leading whitespace;
        /// <paramref name="source"/> itself when it is null or empty, or when
        /// the conversion fails unexpectedly.
        /// </returns>
        public static string StripHTML(string source)
        {
            if (string.IsNullOrEmpty(source))
            {
                return source;
            }

            try
            {
                string result = source;

                // Browsers ignore source formatting: carriage returns become
                // spaces and tabs vanish. Line feeds are parked behind a marker
                // so the author's own line structure survives the tag removal.
                result = result.Replace("\r", " ");
                result = result.Replace("\n", NewlineMarker);
                result = result.Replace("\t", string.Empty);
                result = Regex.Replace(result, @"( )+", " ");

                // Remove the header (normalise the tags first by clearing attributes).
                result = ReplaceIgnoreCase(result, @"<( )*head([^>])*>", "<head>");
                result = ReplaceIgnoreCase(result, @"(<( )*(/)( )*head( )*>)", "</head>");
                result = ReplaceIgnoreCase(result, "(<head>).*?(</head>)", string.Empty);

                // Site-specific rule kept exactly as found (greedy): everything
                // from the first aligned paragraph to the last paragraph end.
                // NOTE(review): presumably navigation chrome - confirm.
                result = ReplaceIgnoreCase(result, @"(<P ALIGN=).*(</P>)", string.Empty);

                // Remove scripts and styles. Non-greedy, so text that sits
                // between two separate blocks is not swallowed with them.
                result = ReplaceIgnoreCase(result, @"<( )*script([^>])*>", "<script>");
                result = ReplaceIgnoreCase(result, @"(<( )*(/)( )*script( )*>)", "</script>");
                result = ReplaceIgnoreCase(result, @"(<script>).*?(</script>)", string.Empty);
                result = ReplaceIgnoreCase(result, @"<( )*style([^>])*>", "<style>");
                result = ReplaceIgnoreCase(result, @"(<( )*(/)( )*style( )*>)", "</style>");
                result = ReplaceIgnoreCase(result, "(<style>).*?(</style>)", string.Empty);

                // Table cells become tabs; line-level tags become one break
                // (self-closing br included); block-level tags become two.
                result = ReplaceIgnoreCase(result, @"<( )*td([^>])*>", "\t");
                result = ReplaceIgnoreCase(result, @"<( )*br( )*(/)?( )*>", "\r");
                result = ReplaceIgnoreCase(result, @"<( )*li( )*>", "\r");
                result = ReplaceIgnoreCase(result, @"<( )*div([^>])*>", "\r\r");
                result = ReplaceIgnoreCase(result, @"<( )*tr([^>])*>", "\r\r");
                result = ReplaceIgnoreCase(result, @"<( )*p([^>])*>", "\r\r");

                // Remove every remaining tag (links, images, comments, ...).
                result = Regex.Replace(result, @"<[^>]*>", string.Empty);

                // Decode the known entities, drop any other well-formed entity,
                // and decode the ampersand last.
                for (int i = 0; i < EntityReplacements.GetLength(0); i++)
                {
                    result = ReplaceIgnoreCase(result, EntityReplacements[i, 0], EntityReplacements[i, 1]);
                }
                result = ReplaceIgnoreCase(result, @"&(?!amp;)#?[0-9a-z]{2,8};", string.Empty);
                result = ReplaceIgnoreCase(result, "&amp;", "&");

                // Restore the source's own line breaks, then apply the colon
                // spacing rule exactly as it appears in the recovered code.
                result = result.Replace(NewlineMarker, "\r");
                result = result.Replace(":", ": ");

                // Drop spaces stranded between breaks/tabs, tabs stranded
                // between two breaks, and repeated tabs right after a break.
                result = Regex.Replace(result, @"(?<=[\r\t]) +(?=[\r\t])", string.Empty);
                result = Regex.Replace(result, @"\r\t+(?=\r)", "\r");
                result = Regex.Replace(result, @"\r\t+", "\r\t");

                // At most two consecutive breaks and four consecutive tabs.
                result = Regex.Replace(result, @"\r{3,}", "\r\r");
                result = Regex.Replace(result, @"\t{5,}", "\t\t\t\t");

                // Remove leading whitespace only; blindly cutting the first
                // character would drop real text when there is none.
                result = result.TrimStart(' ', '\t', '\r');

                // Plain-text mail bodies conventionally use CRLF line breaks.
                return result.Replace("\r", "\r\n");
            }
            catch (System.Exception ex)
            {
                // Best effort: report on stderr (no modal dialog in an
                // unattended batch run) and fall back to the raw input.
                System.Console.Error.WriteLine("StripHTML failed: " + ex.Message);
                return source;
            }
        }

        /// <summary>Sorts the files in place by name, ignoring case.</summary>
        private static void SortByName(FileInfo[] files)
        {
            string[] names = new string[files.Length];
            for (int i = 0; i < files.Length; i++)
            {
                names[i] = files[i].Name;
            }
            System.Array.Sort(names, files, System.StringComparer.OrdinalIgnoreCase);
        }

        /// <summary>
        /// Returns the cleaned-up title of an HTML page, or an empty string
        /// when the page has no title element.
        /// </summary>
        private static string ExtractTitle(string html)
        {
            Match match = TitleRegex.Match(html);
            if (!match.Success)
            {
                return string.Empty;
            }

            // Substitutions reproduced as found; applied in this order.
            StringBuilder title = new StringBuilder(match.Groups[1].Value);
            title.Replace("●", ".");
            title.Replace(" ", "");
            title.Replace("?", "");
            title.Replace("/", "");
            title.Replace("zz", "");
            title.Replace("Re:", "");
            title.Replace("e:", "");
            title.Replace("*", "=");
            return title.ToString();
        }

        /// <summary>Case-insensitive Regex.Replace shorthand.</summary>
        private static string ReplaceIgnoreCase(string input, string pattern, string replacement)
        {
            return Regex.Replace(input, pattern, replacement, RegexOptions.IgnoreCase);
        }
    }
}