由于网络的开放性,我们浏览网站时,网站都会把数据发送到本地,这就为采集创造了条件。
之前研究采集有一段时间了,在刘建的帮助下,终于可以把别人网站的内容采集到自己的网站上显示出来。但是这样有一个很大的弊端:如果被采集的网站关闭了,自己的网站也会因为采集不到内容而无法显示。解决这个问题的最好办法,是把采集到的数据存放到本地,这样就算对方的网站挂了,也不会影响自己的网站。经过和刘建的讨论,总结出了如下采集流程,今天把它实现了。
根据流程图,代码如下:
using System;
using System.Collections.Generic;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Data.SqlClient;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
/// <summary>
/// Code-behind for the chapter page. A chapter is served from the local
/// database when both its book and section rows exist; otherwise it is
/// scraped from the remote site, shown to the visitor, and cached locally
/// (including any images it references) so later requests no longer depend
/// on the remote site being available.
/// </summary>
public partial class chapter : System.Web.UI.Page
{
    // Matches the flat attachment URLs found in scraped chapter content.
    private const string AttachmentPattern = @"/files/article/attachment/(\d+)_(\d+)_(\d+)_(\d+)\.(\w+)";

    // Read by the page markup; names must stay as they are.
    protected string title, content, newContent, bookurl, readurl, provpage, nextpage, zhangjie, keywords, description;

    /// <summary>
    /// Resolves the requested chapter (query string: cutid, bookid, chapterid),
    /// either from the local database or by scraping the remote site, and
    /// fills the fields and page metadata used by the markup.
    /// </summary>
    /// <exception cref="HttpException">
    /// Thrown with status 400 when bookid or chapterid is missing or not numeric.
    /// </exception>
    protected void Page_Load(object sender, EventArgs e)
    {
        string bookid = Request.QueryString["bookid"];
        string sectionid = Request.QueryString["chapterid"];
        // Kept from the original; the return value was never used there either.
        jiangs_Tools.check_str(bookid);
        jiangs_Tools.check_str(sectionid);

        // Both ids are concatenated into SQL below, so only plain digits are accepted.
        if (!IsNumericId(bookid) || !IsNumericId(sectionid))
        {
            throw new HttpException(400, "bookid and chapterid must be numeric.");
        }

        jiang_Db_Sql newdb = new jiang_Db_Sql();
        // Exec_Sql's boolean result is used as the "row exists" flag, exactly as before.
        bool hasBook = QueryFlag(newdb, "select count(*) from [book] where [bookid]=" + bookid);
        bool hasSection = QueryFlag(newdb, "select count(*) from [section] where [sectionid]=" + sectionid);

        if (hasBook && hasSection)
        {
            // Book and section are both cached: read straight from the database.
            LoadChapterFromDatabase(newdb, sectionid);
        }
        else
        {
            bool scraped = ScrapeRemoteChapter(bookid, sectionid);
            Response.Flush(); // kept from the original (intended to push output to the client early)
            newContent = content; // savePic rewrites the image paths in this copy before it is stored

            // Never cache an empty scrape: it would be served forever instead of being retried.
            if (scraped)
            {
                if (!hasBook)
                {
                    StoreBook(newdb, bookid);
                }
                // The original re-inserted the section whenever the book was missing,
                // even if the section row already existed.
                if (!hasSection)
                {
                    StoreSection(newdb, bookid, sectionid);
                }
            }
        }

        this.Page.Title = title + ">> " + zhangjie + " - 天下小说网";
        keywords = "\"" + title + "最新章节列表," + title + "全文阅读," + title + "TXT电子书下载," + title + "JAR电子书下载," + title + "UMD电子书下载\"";
        description = "\"天下小说网为小说爱好者提供" + title + "最近更新章节阅读," + title + "全文在线阅读," + title
            + "最新章节电子书下载(包括" + title + "的TXT格式下载、" + title + "的JAR格式下载、" + title + "的UMD格式下载)\"";
    }

    /// <summary>
    /// Downloads every image listed in <paramref name="imgcontent"/> into
    /// /pic/section/ and rewrites the attachment URLs in <c>newContent</c>
    /// to point at the local copies. Download failures are logged to the
    /// page trace and skipped (best effort).
    /// </summary>
    /// <param name="imgcontent">'|'-separated attachment URLs; a leading '|' is tolerated.</param>
    public void savePic(string imgcontent)
    {
        if (string.IsNullOrEmpty(imgcontent))
        {
            return;
        }

        // RemoveEmptyEntries drops the leading separator (and any stray empty
        // items) instead of blindly cutting off the first character.
        string[] entries = imgcontent.Split(new char[] { '|' }, StringSplitOptions.RemoveEmptyEntries);

        using (WebClient client = new WebClient())
        {
            foreach (string entry in entries) // one download per image
            {
                // The image host stores 1_2_3_4.ext as 1/2/3/4.ext.
                string remotePath = Regex.Replace(entry, AttachmentPattern, "files/article/attachment/$1/$2/$3/$4.$5");
                string imgName = entry.Substring(entry.LastIndexOf('/') + 1); // local file name
                if (imgName.Length == 0)
                {
                    continue;
                }

                string picurl = "http://2.yxmimi.com/" + remotePath; // image address on the target site
                try
                {
                    byte[] data = client.DownloadData(picurl);
                    // Creates or overwrites the file and always releases the handle.
                    File.WriteAllBytes(Server.MapPath("/pic/section/" + imgName), data);
                }
                catch (Exception ex)
                {
                    // Best effort: a missing image must not fail the page, but leave a trace.
                    this.Trace.Warn("savePic", "Could not mirror image " + picurl, ex);
                }
            }
        }

        if (newContent != null)
        {
            newContent = Regex.Replace(newContent, AttachmentPattern, "/pic/section/$1_$2_$3_$4.$5");
        }
    }

    /// <summary>
    /// Fetches the chapter page from the remote site and extracts the display
    /// fields (content, title, navigation URLs, chapter heading) from it.
    /// </summary>
    /// <returns>True when chapter content was extracted; false when there is nothing worth caching.</returns>
    private bool ScrapeRemoteChapter(string bookid, string sectionid)
    {
        // cutid is not validated as numeric, so it is URL-encoded before it is
        // placed in the outbound URL and in rewritten links (no change for digits).
        string cutid = HttpUtility.UrlEncode(Request.QueryString["cutid"]) ?? string.Empty;

        bookurl = "/book/" + bookid + ".html"; // book introduction page

        string html = jiangs_Rex.GetRemoteHtmlCode("http://www.lovepd.com/chapter.php?cutid=" + cutid
            + "&bookid=" + bookid + "&chapterid=" + sectionid);
        if (string.IsNullOrEmpty(html))
        {
            return false;
        }

        // Turn absolute links to the remote site into site-relative ones.
        html = html.Replace("http://www.lovepd.com/", "");
        html = html.Replace("http://lovepd.com/", "");
        html = jiangs_Rex.ReplaceListUrl(html, @"(chapter)\.php\?cutid=(\d+)&bookid=(\d+)&chapterid=(\d+)", 4);
        html = jiangs_Rex.ReplaceListUrl(html, @"(read)\.php\?cutid=(\d+)\&bookid=(\d+)", 3); // table of contents
        content = jiangs_Rex.GetRegValue(@"(?<=<p align=""left"">)(.*?)(?=</p>)", html, 1); // chapter body
        html = jiangs_Rex.ReplaceListUrl(html, @"readend\.php\?bookid=(\d+)", 1, "/read/" + cutid); // table of contents
        content = jiangs_Rex.ReplaceContentImgUrl(content); // convert image URLs for image-based novels
        content = (content ?? string.Empty).Replace("第九文学 www.d9123.com", "");

        title = jiangs_Rex.GetRegValue(@"(\w+)最新章节列表</b>", html, 1); // book name
        readurl = jiangs_Rex.GetRegValue(@"main_page = ""([^""]+)""", html, 1); // table-of-contents URL
        provpage = jiangs_Rex.GetRegValue(@"back_page = ""([^""]+)""", html, 1); // previous page URL
        nextpage = jiangs_Rex.GetRegValue(@"next_page = ""([^""]+)""", html, 1); // next page URL
        zhangjie = jiangs_Rex.GetRegValue(@"(?<=<h1>)([^>]+)(?=</h1>)", html, 1); // chapter heading

        return content.Length > 0;
    }

    /// <summary>Inserts the book row for the scraped title.</summary>
    private void StoreBook(jiang_Db_Sql db, string bookid)
    {
        string sql = string.Format("insert into [book]([bookId],[bookName]) values('{0}','{1}')",
            bookid, SqlLiteral(title));
        Execute(db, sql);
    }

    /// <summary>Mirrors the chapter's images locally, then inserts the section row.</summary>
    private void StoreSection(jiang_Db_Sql db, string bookid, string sectionid)
    {
        string imgContent = jiangs_Rex.GetRegValue(AttachmentPattern, content, "|", ""); // images used by the content
        if (!string.IsNullOrEmpty(imgContent))
        {
            savePic(imgContent); // also rewrites newContent to the local image paths
        }

        string sql = string.Format("insert into [section]([sectionId],[bookId],[sectionTitle],[sectionContent],[readUrl],[provPage],[nextPage]) values('{0}','{1}','{2}','{3}','{4}','{5}','{6}')",
            sectionid, bookid, SqlLiteral(zhangjie), SqlLiteral(newContent), SqlLiteral(readurl), SqlLiteral(provpage), SqlLiteral(nextpage));
        Execute(db, sql);
    }

    /// <summary>Fills the display fields from the cached book/section rows.</summary>
    private void LoadChapterFromDatabase(jiang_Db_Sql db, string sectionid)
    {
        string sql = "select a.[bookName],b.[sectionTitle],b.[sectionContent],b.[readUrl],b.[provPage],b.[nextPage] from [book] as a,[section] as b where a.[bookId]=b.[bookId] and b.[sectionId]=" + sectionid;
        db.Open();
        try
        {
            using (SqlDataReader reader = db.Re_dr(sql))
            {
                if (reader.Read())
                {
                    title = reader[0].ToString();    // book name
                    zhangjie = reader[1].ToString(); // chapter heading
                    content = reader[2].ToString();  // chapter body
                    readurl = reader[3].ToString();  // table of contents
                    provpage = reader[4].ToString(); // previous page
                    nextpage = reader[5].ToString(); // next page
                    bookurl = "/book/" + Request.QueryString["bookid"] + ".html"; // book page
                }
            }
        }
        finally
        {
            db.Close(); // released even when the query throws
        }
    }

    /// <summary>Runs a query through Exec_Sql and returns its boolean result, always closing the connection.</summary>
    private static bool QueryFlag(jiang_Db_Sql db, string sql)
    {
        db.Open();
        try
        {
            return db.Exec_Sql(sql);
        }
        finally
        {
            db.Close();
        }
    }

    /// <summary>Runs a statement through ExecSql, always closing the connection.</summary>
    private static void Execute(jiang_Db_Sql db, string sql)
    {
        db.Open();
        try
        {
            db.ExecSql(sql);
        }
        finally
        {
            db.Close();
        }
    }

    /// <summary>True when <paramref name="value"/> consists solely of ASCII digits.</summary>
    private static bool IsNumericId(string value)
    {
        return value != null && Regex.IsMatch(value, @"\A[0-9]+\z");
    }

    /// <summary>
    /// Escapes a value for use inside a single-quoted SQL string literal by
    /// doubling single quotes; null becomes the empty string (which is what
    /// string.Format produced for null before).
    /// </summary>
    private static string SqlLiteral(string value)
    {
        return (value ?? string.Empty).Replace("'", "''");
    }
}