现在市面上大部分爬虫都是用 Python 写的,这里另辟蹊径,用 C# 来实现一个。
本文获取的网站是:https://www.bqktxt.com
有更好的方法欢迎大佬指点
好了!开始吧!
1. 首先创建一个 WinForms 窗体应用,并画出窗体
用到的组件有:Label、TextBox、Button、LinkLabel、ProgressBar
各位注意区别
2. 创建一个接收类(用于传参)
3. 检测URL按钮代码
这一步的主要作用是把网页的所有结构下载下来然后进行拆解,得到自己想要的部分。
我这里获取了"章节标题"、"文章名称"、"每一章节的URL"等需要的信息。
我们来看一下网页的HTML然后可以更好的写正则表达式来获取内容
根据上图可以看到章节的内容全部都在dd标签里面,然后还分了【正文部分】和【最新章节】,一般来说获取正文部分就可以,看了他的结构以后就可以开始写代码啦!
// Body of the "detect URL" button handler: downloads the chapter index page, extracts the
// book title and every chapter link, and shows the result in the UI.
// It reads/writes state declared outside this fragment: pagetype, tilepage, ddpage, apage,
// biaoti, url, flag, label3 and label5.
try
{
    // Reset the accumulators so that pressing the button again does not append to the
    // results of a previous detection (which would duplicate every chapter).
    ddpage = "";
    apage = "";
    url.Clear();
    flag = false;

    byte[] pageData;
    // WebClient is IDisposable; release it as soon as the download has finished.
    using (WebClient client = new WebClient())
    {
        // Credentials used to authenticate the request against the remote resource.
        client.Credentials = CredentialCache.DefaultCredentials;
        // The index URL is hard-coded here; it could be read from the "download URL" text box instead.
        pageData = client.DownloadData("https://www.bqktxt.com/38_38836/");
    }

    // Decode the page according to the declared page encoding.
    // NOTE(review): Encoding.Default is only GBK on a Chinese-locale .NET Framework install — confirm the target runtime.
    string pageHtml = "";
    if (pagetype == "GBK" || pagetype == "GB2312")
    {
        pageHtml = Encoding.Default.GetString(pageData);
    }
    if (pagetype == "UTF-8")
    {
        pageHtml = Encoding.UTF8.GetString(pageData);
    }

    Regex redl = new Regex(@"(?is)(?<=<dl>)(?:(?!</dl).)*");   // inner text of <dl>
    Regex retile = new Regex(@"(?is)(?<=<h2>)(?:(?!</h2).)*"); // inner text of <h2>
    Regex redd = new Regex(@"(?is)(?<=<dd>)(?:(?!</dd).)*");   // inner text of <dd>

    // Book title: the last <h2> on the page wins.
    foreach (Match retiles in retile.Matches(pageHtml))
    {
        tilepage = retiles.Value;
    }

    // Concatenate the content of every <dl> (the chapter lists).
    StringBuilder listHtml = new StringBuilder();
    foreach (Match rdl in redl.Matches(pageHtml))
    {
        listHtml.Append(rdl.Value);
    }
    ddpage = listHtml.ToString();

    // Marker that separates the "latest chapters" list from the main text list.
    string name = $"<dt>《{tilepage}》正文卷</dt>";
    // Ordinal search: this is markup, not linguistic text.
    int cp = ddpage.IndexOf(name, StringComparison.Ordinal);
    if (cp < 0)
    {
        // Without this check Remove(0, -1) would throw an opaque ArgumentOutOfRangeException.
        throw new InvalidOperationException("未找到正文标记:" + name);
    }

    // Keep only the main text section (everything from the marker onwards).
    string zhengwen = ddpage.Substring(cp);

    StringBuilder chapterHtml = new StringBuilder();
    foreach (Match redds in redd.Matches(zhengwen))
    {
        chapterHtml.Append(redds.Value).Append("\r\n");
    }
    apage = chapterHtml.ToString();

    // Parse the collected <a> elements; their combined inner text is the list of chapter titles.
    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(apage);
    biaoti = doc.DocumentNode.InnerText;

    // Extract every href. The whitespace around '=' is optional: the previous pattern
    // required a literal space ("href =") and therefore never matched href="...".
    Regex reg = new Regex(@"(?is)<a[^>]*?href\s*=\s*(['""\s]?)(?<href>[^'""\s]*)\1[^>]*?>");
    foreach (Match m in reg.Matches(apage))
    {
        url.Add(m.Groups["href"].Value);
    }

    label3.Text = tilepage;
    label5.Text = url.Count.ToString() + "章";
    MessageBox.Show("检测成功!可以下载");
    flag = true;
}
catch (Exception ex)
{
    // Any download or parsing failure is reported to the user; flag stays false.
    MessageBox.Show(ex.Message);
}
4. 然后我们完善窗体上的按钮(选择存放文件的路径)
// Let the user pick the output folder and show the chosen path in textBox2.
// The dialog is a disposable component, so it is wrapped in a using block.
using (FolderBrowserDialog folderBrowserDialog1 = new FolderBrowserDialog())
{
    if (folderBrowserDialog1.ShowDialog() == DialogResult.OK) // user confirmed a folder
    {
        textBox2.Text = folderBrowserDialog1.SelectedPath;
    }
}
5. 现在到了最重要的一步了
创建一个类,我这里是 HttpHelper.cs
代码如下!
/// <summary>
/// Downloads every chapter of a book and writes all chapters into one text file,
/// reporting progress and completion through the public delegates.
/// </summary>
public class HttpHelper
{
    /// <summary>Progress callback type; <c>step</c> is the number of chapters just written.</summary>
    public delegate void UpdateUI(int step);

    /// <summary>Invoked after each chapter has been written successfully. May be null.</summary>
    public UpdateUI UpdateUIDelegate;

    /// <summary>Completion callback type.</summary>
    public delegate void AccomplishTask();

    /// <summary>Invoked once after all chapters have been processed. May be null.</summary>
    public AccomplishTask TaskCallBack;

    // Chapter links collected from the index page are site-relative; this prefix makes them absolute.
    private const string SiteRoot = "https://www.bqktxt.com";

    // Singleline lets '.' span line breaks, so chapter bodies containing newlines still match.
    private static readonly Regex ContentPattern =
        new Regex(@"<div id=""content"" class=""showtxt"">.*?</div>", RegexOptions.Singleline);
    private static readonly Regex TitlePattern =
        new Regex(@"<h1>.*?</h1>", RegexOptions.Singleline);
    private static readonly Regex TagPattern = new Regex(@"<[^>]*>");

    /// <summary>
    /// Thread entry point: downloads each chapter and writes it to
    /// "&lt;filepath&gt;\&lt;zjname&gt;.txt". An existing file is overwritten by the first
    /// chapter written in this run; later chapters are appended.
    /// </summary>
    /// <param name="databased">
    /// A <c>database</c> instance carrying the chapter URLs, output folder, page encoding
    /// name and book title. It is typed as <c>object</c> so the method can be used as a
    /// <c>ParameterizedThreadStart</c> target.
    /// </param>
    /// <exception cref="InvalidCastException"><paramref name="databased"/> is not a <c>database</c>.</exception>
    public void Write(object databased)
    {
        // Downcast of the thread parameter (a reference conversion, not unboxing).
        database sd = (database)databased;
        List<string> urldata = sd.url;
        string pagetype = sd.pagetype;
        string urldone = sd.filepath + "\\" + sd.zjname + ".txt";

        // Never trust the separately supplied count more than the list itself.
        int chapterCount = Math.Min(sd.urlmuns, urldata.Count);

        // False until the first chapter of this run is on disk, so the first write overwrites.
        bool appendToFile = false;

        using (WebClient client = new WebClient())
        {
            // Credentials used to authenticate the requests against the remote resource.
            client.Credentials = CredentialCache.DefaultCredentials;

            for (int i = 0; i < chapterCount; i++)
            {
                try
                {
                    byte[] pageData = client.DownloadData(SiteRoot + urldata[i]);
                    string pageHtml = Decode(pageData, pagetype);

                    // Title and body are computed per chapter, so a page that does not match
                    // yields empty text instead of repeating the previous chapter.
                    string title = StripTags(LastMatch(TitlePattern, pageHtml));
                    string body = StripTags(LastMatch(ContentPattern, pageHtml))
                        .Replace(" ", "")
                        // Remove leftover inline script calls so that only the text remains.
                        .Replace("app2();", "")
                        .Replace("chaptererror();", "")
                        .Replace("read2();", "");

                    using (StreamWriter sw = new StreamWriter(urldone, appendToFile, Encoding.UTF8))
                    {
                        if (appendToFile)
                        {
                            sw.WriteLine("\r\n" + title + "\r\n" + body);
                        }
                        else
                        {
                            sw.WriteLine(title); // chapter title
                            sw.WriteLine(body);  // chapter content
                        }
                    }

                    appendToFile = true;
                    UpdateUIDelegate?.Invoke(1); // advance the progress display by one chapter
                }
                catch (Exception ex)
                {
                    // Best effort: one failed chapter must not abort the whole download or
                    // crash the process, but the failure is recorded instead of being swallowed.
                    System.Diagnostics.Trace.WriteLine(
                        "Chapter " + i + " (" + urldata[i] + ") failed: " + ex.Message);
                }
            }
        }

        TaskCallBack?.Invoke(); // notify the caller that the run has finished
    }

    // Decodes the downloaded bytes according to the page encoding name; unknown names yield "".
    private static string Decode(byte[] pageData, string pagetype)
    {
        if (pagetype == "GBK" || pagetype == "GB2312")
        {
            // NOTE(review): Encoding.Default is only GBK on a Chinese-locale .NET Framework install — confirm.
            return Encoding.Default.GetString(pageData);
        }
        if (pagetype == "UTF-8")
        {
            return Encoding.UTF8.GetString(pageData);
        }
        return "";
    }

    // Returns the text of the last match of the pattern in the input, or "" when nothing matches.
    private static string LastMatch(Regex pattern, string input)
    {
        string found = "";
        foreach (Match m in pattern.Matches(input))
        {
            found = m.Value;
        }
        return found;
    }

    // Removes every HTML tag from the given markup.
    private static string StripTags(string html)
    {
        return TagPattern.Replace(html, string.Empty);
    }
}
6. 在 WinForms 程序的"全文下载"按钮中调用刚刚写好的方法
// Bundle everything the background worker needs into one parameter object.
database da = new database
{
    urlmuns = url.Count,
    url = url,
    filepath = textBox2.Text,
    pagetype = pagetype,
    zjname = tilepage
};

// One progress step per chapter, starting from zero.
this.progressBar1.Maximum = url.Count;
this.progressBar1.Value = 0;

// Wire the worker's callbacks to the form's progress and completion handlers.
HttpHelper dataWrite = new HttpHelper();
dataWrite.UpdateUIDelegate += UpdataUIStatus;
dataWrite.TaskCallBack += Accomplish;

// Run the download on a background thread, passing the parameter object to Write.
Thread thread = new Thread(new ParameterizedThreadStart(dataWrite.Write))
{
    IsBackground = true
};
thread.Start(da);
然后是用到的两个方法,在 WinForms 窗体的代码区定义就行
// Advances the progress bar by <step> and refreshes the "done/total" label.
// When called from a non-UI thread the update is marshalled through Invoke.
private void UpdataUIStatus(int step)
{
    if (!InvokeRequired)
    {
        // Already on the UI thread: update the controls directly.
        this.progressBar1.Value += step;
        this.label7.Text = this.progressBar1.Value.ToString() + "/" + this.progressBar1.Maximum.ToString();
        return;
    }

    AsynUpdateUI applyOnUiThread = delegate (int s)
    {
        this.progressBar1.Value += s;
        this.label7.Text = this.progressBar1.Value.ToString() + "/" + this.progressBar1.Maximum.ToString();
    };
    this.Invoke(applyOnUiThread, step);
}
// Completion callback: tells the user the task is finished.
// Any other post-completion logic can be added here.
private void Accomplish() => MessageBox.Show("任务完成");
这些全部写好以后就可以开始运行啦,看看效果吧!
好了本篇文章就到这里啦欢迎大佬指点