c#爬小说
第一把,用HttpWebRequest爬
前言
现在越来越难下到小说了,要么就是病毒,要么就是给钱啥的,或者可以在网页上直接看,可是我们是程序员啊,这能忍,我下不到,我还不能去爬么
一、HttpWebRequest
纯自带 不用下包,也就是说代码复制了就能用 来来来 抄抄抄
二、代码
1.界面代码 就一个框 外加个按钮
<!-- Crawler window layout: row 0 holds the URL entry (label + textbox + fetch
     button); row 1 is the log/progress textbox spanning the input columns. -->
<Grid>
<Grid.RowDefinitions>
<!-- row 0: input bar sized to content; row 1: takes the remaining space -->
<RowDefinition Height="Auto"></RowDefinition>
<RowDefinition></RowDefinition>
</Grid.RowDefinitions>
<Grid.ColumnDefinitions>
<!-- col 0: label; col 1: stretching URL box; cols 2-3: buttons sized to content -->
<ColumnDefinition Width="Auto"></ColumnDefinition>
<ColumnDefinition ></ColumnDefinition>
<ColumnDefinition Width="Auto"></ColumnDefinition>
<ColumnDefinition Width="Auto"></ColumnDefinition>
</Grid.ColumnDefinitions>
<Label>网址:</Label>
<!-- Starting chapter URL; the crawler follows "next page/chapter" links from here -->
<TextBox Grid.Column="1" Name="urlTb"
Text="https://www.paozww.com/biquge/348374/90167669.html"></TextBox>
<!--Text="https://www.douyinxs.com/bqg/1224636/299410236_2.html"></TextBox>-->
<!-- Progress / error log written from the background crawl loop -->
<TextBox Grid.Row="1" Grid.ColumnSpan="2" TextWrapping="Wrap" Name="infoTb"></TextBox>
<Button Grid.Column="2" Click="Button_Click">获取</Button>
</Grid>
2.逻辑代码
代码如下(示例):
string url = urlTb.Text;   // starting chapter URL taken from the UI
int i = 0;                 // number of pages successfully written so far
bool read = true;          // loop flag; cleared when there is no next link

// Crawl on a background thread so the UI stays responsive.
Task.Run(() =>
{
    while (read)
    {
        Thread.Sleep(100); // small delay between requests, be polite to the server
        try
        {
            #region Fetch the page
            HttpWebRequest _request = (HttpWebRequest)WebRequest.Create(url);
            HttpWebResponse response = (HttpWebResponse)_request.GetResponse();
            if (response.StatusCode != HttpStatusCode.OK)
            {
                // FIX: UI controls must be touched on the dispatcher thread.
                // The original assigned infoTb.Text directly from this worker
                // thread, which throws InvalidOperationException in WPF.
                this.Dispatcher.Invoke(new Action(() =>
                {
                    this.infoTb.Text += $"错误:{url}";
                }));
                read = false;
                break;
            }
            Encoding cd = System.Text.Encoding.GetEncoding(response.CharacterSet);
            string htmlContent;
            Stream resStream = response.GetResponseStream();
            // FIX: check ContentEncoding explicitly instead of swallowing
            // exceptions with an empty catch, and dispose the reader (which
            // also disposes the underlying stream) via using.
            string contentEncoding = response.ContentEncoding;
            if (!string.IsNullOrEmpty(contentEncoding) && contentEncoding.ToLower().Contains("gzip"))
            {
                resStream = new GZipStream(resStream, CompressionMode.Decompress);
            }
            using (StreamReader sr = new StreamReader(resStream, cd))
            {
                htmlContent = sr.ReadToEnd();
            }
            response.Close();
            if (string.IsNullOrWhiteSpace(htmlContent))
            {
                break;
            }
            #endregion

            // Locate the "next page" / "next chapter" link; stop when neither exists.
            string nextpage = SubstringSingle(htmlContent, "<a id=\"next\" href=\"", "\"> 下一页</a>");
            if (string.IsNullOrWhiteSpace(nextpage))
            {
                nextpage = SubstringSingle(htmlContent, "<a id=\"next\" href=\"", "\">下一章</a>");
            }
            if (string.IsNullOrWhiteSpace(nextpage))
            {
                read = false;
                break;
            }
            // NOTE(review): "article-author" looks site-specific; on this site it
            // appears to carry the chapter title — confirm against the target page.
            string title = SubstringSingle(htmlContent, "article-author=\"", "\">");
            // The link is relative, so prepend the site root before following it.
            url = "https://www.paozww.com" + nextpage;
            // Every <p>...</p> span is a paragraph of the chapter body.
            var novelContentList = SubstringMultiple(htmlContent, "<p>", "</p>");
            string novelContent = $"\r\n章节:{title}\r\n{string.Join("\r\n\r\n", novelContentList)}";
            string outputPath = "自己写目录啊亲\\自己写名字啊.txt";
            // FIX: using guarantees the file handle is released even when
            // WriteLine throws (the original leaked the writer on error).
            using (StreamWriter sw = new StreamWriter(outputPath, true))
            {
                sw.WriteLine(novelContent);
            }
            i++;
            this.Dispatcher.Invoke(new Action(() =>
            {
                infoTb.Text = i.ToString();
            }));
        }
        catch (Exception ex)
        {
            this.Dispatcher.Invoke(new Action(() =>
            {
                this.infoTb.Text += $"错误:{ex}";
                this.urlTb.Text = url;
            }));
            // On failure, wait a few seconds before retrying the same URL.
            Thread.Sleep(3000);
        }
    }
});
/// <summary>
/// (Batch) Extracts every substring of <paramref name="source"/> that sits
/// between an occurrence of <paramref name="startStr"/> and the nearest
/// following <paramref name="endStr"/>.
/// </summary>
/// <param name="source">Source string to scan.</param>
/// <param name="startStr">Literal start delimiter (not a regex).</param>
/// <param name="endStr">Literal end delimiter (not a regex).</param>
/// <returns>All matched middle substrings, in document order; empty list when none.</returns>
public List<string> SubstringMultiple(string source, string startStr, string endStr)
{
    // FIX: escape the delimiters so literal text containing regex
    // metacharacters ('(', '+', '?', ...) cannot crash Regex construction
    // or silently change the pattern's meaning.
    Regex rg = new Regex("(?<=(" + Regex.Escape(startStr) + "))[.\\s\\S]*?(?=(" + Regex.Escape(endStr) + "))", RegexOptions.Multiline | RegexOptions.Singleline);
    MatchCollection matches = rg.Matches(source);
    List<string> resList = new List<string>();
    foreach (Match item in matches)
        resList.Add(item.Value);
    return resList;
}
/// <summary>
/// Extracts the first substring of <paramref name="source"/> that sits between
/// <paramref name="startStr"/> and the nearest following <paramref name="endStr"/>.
/// </summary>
/// <param name="source">Source string to scan.</param>
/// <param name="startStr">Literal start delimiter (not a regex).</param>
/// <param name="endStr">Literal end delimiter (not a regex).</param>
/// <returns>The first matched middle substring, or "" when there is no match.</returns>
public string SubstringSingle(string source, string startStr, string endStr)
{
    // FIX: escape the delimiters so literal text containing regex
    // metacharacters cannot crash Regex construction or alter the pattern.
    Regex rg = new Regex("(?<=(" + Regex.Escape(startStr) + "))[.\\s\\S]*?(?=(" + Regex.Escape(endStr) + "))", RegexOptions.Multiline | RegexOptions.Singleline);
    return rg.Match(source).Value;
}
问题与总结
有啥爬虫需求的可以私信我,嘿嘿(打广告啊)
使用这个方法呢,有的网站是不给爬的(人家反爬啊)
var novelContentList = SubstringMultiple(htmlContent, "&lt;p&gt;", "&lt;/p&gt;");这个地方的第二三个参数自己去那个网页上看看哈,大多数长这样(意思是获取第二三个参数之间的所有数据列表,其实就是小说内容啦,但是有些稍有差异,得自己F12看看),还有下一章下一页的都要根据目标网页改改哈