自述:在工作中遇到很多有趣的需求,在实施过程中看到了自己的很多不足。
仅在这里记录收获,积累成就感,愉悦自己,感谢他人!
概述:工作中远程测试别人的接口,竟引发了想爬小说看的欲望!!!
/// <summary>
/// Downloads the page at <paramref name="Url"/>, transparently decompressing
/// gzip/deflate responses, and decodes the bytes as GB2312 (the site's charset).
/// </summary>
/// <param name="Url">Absolute URL of the page to fetch.</param>
/// <returns>The decoded page markup.</returns>
public string GetHtml(string Url)
{
    WebRequest request = WebRequest.Create(Url);
    request.Method = "GET";
    // Advertise compression support; the server usually answers compressed.
    request.Headers.Add("Accept-Encoding", "gzip,deflate");
    // using blocks guarantee the response/streams are released even on error.
    // (The original leaked the response, and in the uncompressed path never
    // disposed the StreamReader.)
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream raw = response.GetResponseStream())
    {
        Stream body = raw;
        string contentEncoding = (response.ContentEncoding ?? "").ToLowerInvariant();
        if (contentEncoding.Contains("gzip"))
        {
            body = new GZipStream(raw, CompressionMode.Decompress);
        }
        else if (contentEncoding.Contains("deflate"))
        {
            // We ask for deflate in Accept-Encoding, so we must be able to
            // decode it too (the original only handled gzip).
            body = new DeflateStream(raw, CompressionMode.Decompress);
        }
        using (StreamReader reader = new StreamReader(body, Encoding.GetEncoding("gb2312")))
        {
            return reader.ReadToEnd();
        }
    }
}
// Base URL of the novel's table-of-contents page; chapter sub-URLs are appended to it.
string HeadUrl = "http://www.166xs.com/xiaoshuo/53/53091/";
// Fetch the whole book: download every chapter in the table of contents and
// append its text to a local txt file, then report the elapsed time in minutes.
protected void GetBook_Click(object sender, EventArgs e)
{
    Stopwatch st = Stopwatch.StartNew();
    List<MenuType> menus = GetMenu();
    // Make sure the output folder exists before opening the file.
    if (!Directory.Exists(@"E:\cx"))
    {
        Directory.CreateDirectory(@"E:\cx");
    }
    string fileTxt = @"E:\cx\NoveL_一路凡尘.txt";
    // using guarantees the writer/stream are flushed and closed even if a
    // download throws mid-book (the original leaked both on any exception).
    using (FileStream fs = new FileStream(fileTxt, File.Exists(fileTxt) ? FileMode.Append : FileMode.Create, FileAccess.Write))
    using (StreamWriter sw = new StreamWriter(fs))
    {
        foreach (var tt in menus)
        {
            string datas = GetHtml(HeadUrl + tt.SubUrl); // chapter html
            // Split the chapter body on the site's paragraph separator
            // (presumably non-breaking-space padding — TODO confirm against a live page).
            string[] data = datas.Replace(" ", "%").Split('%');
            sw.WriteLine("\r\n\t\t\t\t\t\t\t" + tt.Menu); // chapter heading on its own line
            for (int i = 1; i < data.Length; i++) // Length, not LINQ Count(), on an array
            {
                if (i == data.Length - 1) // last fragment carries the site's footer text
                {
                    // Guard against a missing footer: IndexOf returns -1 and the
                    // original Substring call would throw in that case.
                    int footer = data[i].IndexOf("166小说阅读网");
                    sw.WriteLine(footer >= 0 ? data[i].Substring(0, footer) : data[i]);
                    continue;
                }
                sw.WriteLine("\t" + data[i].Replace("<br /><br />", ""));
            }
        }
    }
    st.Stop(); // stop BEFORE reading, so the reported time is accurate
    // Exact ms→min conversion (the original multiplied by the approximation 0.0000167
    // and read the clock before stopping it).
    Time.Text = st.Elapsed.TotalMinutes.ToString();
}
/// <summary>
/// Builds the chapter list (title + relative URL) for the book.
/// NOTE(review): the two #regions below are each labelled "choose one", but BOTH
/// execute, so every chapter is added twice — comment one region out before use.
/// </summary>
/// <returns>All parsed table-of-contents entries.</returns>
public List<MenuType> GetMenu()
{
    List<MenuType> menus = new List<MenuType>();
    #region Read the table of contents from a local txt copy (choose one)
    string FilePath = @"E:\cx\aa.txt";
    string line = "";
    MenuType menuLine;
    using (StreamReader sr = new StreamReader(FilePath, Encoding.GetEncoding("gb2312")))
    {
        while ((line = sr.ReadLine()) != null)
        {
            if (line == "")
                break; // stop at the first blank line
            menuLine = new MenuType();
            // Each line looks like: <dd><a href="xxx.html">title</a></dd>
            // NOTE(review): TrimEnd('a') also strips a trailing 'a' from titles — verify.
            menuLine.Menu = line.TrimEnd('>', 'a', '/', '<').Substring(line.IndexOf('>') + 1);
            menuLine.SubUrl = line.Split('\"')[1];
            menus.Add(menuLine);
        }
    }
    #endregion
    #region Scrape the table of contents straight from the site (choose one)
    string MenuHtml = GetHtml(HeadUrl);
    // Example row: <dd><a href="31916584.html">第520章 摊牌(一)(捉虫)</a></dd>
    Regex re = new Regex("<dd><a[^>]+?href=\"([^\"]+)\"[^>]*>([^<]+)</a></dd>");
    // Typed foreach + capture groups instead of re-splitting the match text:
    // group 1 = href, group 2 = link text (same values, and titles containing
    // '>' are no longer truncated by Split).
    foreach (Match tem in re.Matches(MenuHtml))
    {
        menus.Add(new MenuType
        {
            SubUrl = tem.Groups[1].Value,
            Menu = tem.Groups[2].Value,
        });
    }
    #endregion
    return menus;
}
/// <summary>A single table-of-contents entry: a chapter title plus its page URL.</summary>
public class MenuType
{
    /// <summary>Chapter page address, relative to the book's base URL.</summary>
    public string SubUrl { get; set; }

    /// <summary>Chapter title as shown in the table of contents.</summary>
    public string Menu { get; set; }
}
测试了一下,用了4.0151643min,待完善改进。
参考:http://www.cnblogs.com/cang12138/p/7464226.html
中文乱码:http://blog.csdn.net/zhuyu19911016520/article/details/46647001
自我提醒:在调试的过程中,编码的坑把自己埋的好深!值得记住的是获取的网页一般都是进行过压缩的,需要解压处理,最后确认网页中文的编码方式。
一路积累,一路成长!欢迎见证!