1.开始
1.1 本文章仅供学习与交流。
1.2 正式开始
本文的爬虫思路是:先访问网站上列出所有书本信息的页面(如下图),获取其html中的信息(如url、书名、作者等);然后通过书本详细页面的url查找每个章节的信息;最后通过章节的url获取该章节的内容。(其中书名的信息我已通过爬虫存入数据库,目的是方便搜索和查看,最主要的原因是我自己想这么做;本章教学没有用到持久化。)
1.3 用vs新建一个.net core 的 web api
本人用的是3.1版本的,新建项目的操作无需多说,引用Nuget包如下图(本章内容不需要引用ef的三个包)
新建一个帮助类,其目的是帮助我们更方便地获取html文本内容,并匹配出我们所需要的内容(暂时可以不用细看,直接用即可)
using PythonHelper.BQGDtos;
using RestSharp;
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
namespace PythonHelper
{
/// <summary>
/// Static helpers that download an HTML page and pull pieces out of it with regular expressions.
/// Every extraction helper returns the text captured by group 1 of the supplied pattern.
/// </summary>
public class HelperMe
{
    /// <summary>
    /// Removes everything that looks like a tag (<c>&lt;...&gt;</c>) from <paramref name="value"/>.
    /// </summary>
    /// <param name="value">Text that may contain tags; null or empty input is returned unchanged.</param>
    /// <param name="isKongGe">When true, space characters are removed as well.</param>
    /// <returns>The text without the matched tags.</returns>
    public static string GetValues(string value, bool isKongGe = false)
    {
        if (string.IsNullOrEmpty(value))
        {
            return value;
        }

        // Each matched tag is removed literally, one after another.
        foreach (var tag in showMatchValue(value, "(<.*?>)"))
        {
            value = value.Replace(tag, "");
        }

        if (isKongGe)
        {
            // NOTE(review): text fetched through GetTestOrgan already has ordinary spaces removed,
            // so this flag may have been meant to strip the "&nbsp;" entity instead - confirm.
            value = value.Replace(" ", "");
        }

        return value;
    }

    /// <summary>
    /// Downloads a page with a GET request and returns its body with spaces and line breaks removed.
    /// </summary>
    /// <param name="GetHost2">Value passed to the <c>RestClient</c> constructor.</param>
    /// <param name="url">Value passed to the <c>RestRequest</c> constructor.</param>
    /// <returns>The compacted response body; an empty string when the response has no body.</returns>
    public static async Task<string> GetTestOrgan(string GetHost2 = "", string url = "/")
    {
        RestClient restClient = new RestClient(GetHost2);
        RestRequest restRequest = new RestRequest(url);
        var response = await restClient.GetAsync(restRequest);
        return GetAllString(response.Content);
    }

    /// <summary>
    /// Removes every space, carriage return and line feed from <paramref name="fff"/>.
    /// Patterns applied to the result must therefore be written without spaces.
    /// </summary>
    /// <param name="fff">Text to compact; null is treated as empty.</param>
    /// <returns>The compacted text (never null).</returns>
    public static string GetAllString(string fff)
    {
        if (string.IsNullOrEmpty(fff))
        {
            return string.Empty;
        }

        return fff.Replace(" ", "").Replace("\r\n", "").Replace("\r", "").Replace("\n", "");
    }

    /// <summary>
    /// Finds every match of <paramref name="expr"/> in <paramref name="text"/> and returns
    /// the text of capture group 1 of each match, in match order.
    /// </summary>
    /// <param name="text">Text to search; null yields an empty list.</param>
    /// <param name="expr">Regular expression; it should define a capture group 1.</param>
    /// <returns>One entry per match (never null).</returns>
    public static List<string> showMatchValue(string text, string expr)
    {
        Console.WriteLine("The Expression: " + expr);
        List<string> captured = new List<string>();
        if (text == null)
        {
            return captured;
        }

        foreach (Match m in Regex.Matches(text, expr))
        {
            captured.Add(m.Groups[1].Value);
        }

        return captured;
    }

    /// <summary>
    /// Applies a chain of patterns: every pattern except the last narrows the text down to its
    /// first capture, and the last pattern's captures are returned.
    /// </summary>
    /// <param name="Url">Address to download, or the text itself when <paramref name="IsUrl"/> is false.</param>
    /// <param name="expr">Patterns to apply in order.</param>
    /// <param name="IsUrl">True when <paramref name="Url"/> is an address to download.</param>
    /// <returns>
    /// The captures of the last pattern; an empty list when there are no patterns or when a
    /// narrowing pattern matches nothing.
    /// </returns>
    public static async Task<List<string>> GetListJiFirstStep(string Url, List<string> expr, bool IsUrl = true)
    {
        if (expr == null || expr.Count == 0)
        {
            return new List<string>();
        }

        var text = IsUrl ? await GetTestOrgan(Url) : Url;

        for (int i = 0; i < expr.Count - 1; i++)
        {
            var narrowed = showMatchValue(text, expr[i]);
            if (narrowed.Count == 0)
            {
                // Nothing left to search in; report "no results" instead of indexing an empty list.
                return new List<string>();
            }

            text = narrowed[0];
        }

        return showMatchValue(text, expr[expr.Count - 1]);
    }

    /// <summary>
    /// Builds one <c>CollectionDto</c> per input fragment.
    /// </summary>
    /// <param name="txt">Fragments to parse; null is treated as empty.</param>
    /// <param name="expr">Pattern whose first capture becomes <c>CollectionDesc</c>.</param>
    /// <param name="expr2">Pattern whose first capture becomes <c>GetHref</c>.</param>
    /// <returns>
    /// One DTO per fragment, in input order. A field is an empty string when its pattern
    /// matches nothing in that fragment.
    /// </returns>
    public static Task<List<CollectionDto>> GetListJiSecondStep(List<string> txt, string expr, string expr2)
    {
        List<CollectionDto> items = new List<CollectionDto>();
        if (txt != null)
        {
            foreach (var fragment in txt)
            {
                items.Add(new CollectionDto
                {
                    CollectionDesc = FirstMatchOrEmpty(fragment, expr),
                    GetHref = FirstMatchOrEmpty(fragment, expr2)
                });
            }
        }

        // The work is purely synchronous, so the result is wrapped instead of using an await-less async method.
        return Task.FromResult(items);
    }

    /// <summary>
    /// Extracts the fragments with <see cref="GetListJiFirstStep"/> and converts them to DTOs
    /// with <see cref="GetListJiSecondStep"/>.
    /// </summary>
    /// <param name="Url">Address to download, or the text itself when <paramref name="IsUrl"/> is false.</param>
    /// <param name="expr">Patterns that narrow the text down to the individual fragments.</param>
    /// <param name="expr2">Pattern that captures the description inside a fragment.</param>
    /// <param name="expr3">Pattern that captures the address inside a fragment.</param>
    /// <param name="IsUrl">True when <paramref name="Url"/> is an address to download.</param>
    /// <returns>One DTO per extracted fragment.</returns>
    public static async Task<List<CollectionDto>> GetNovelJi(string Url, List<string> expr, string expr2, string expr3, bool IsUrl = true)
    {
        var fragments = await GetListJiFirstStep(Url, expr, IsUrl);
        return await GetListJiSecondStep(fragments, expr2, expr3);
    }

    /// <summary>
    /// Downloads a page and returns the first capture of <paramref name="expr"/> in it.
    /// </summary>
    /// <param name="url">Address to download.</param>
    /// <param name="expr">Pattern whose first capture is returned.</param>
    /// <param name="IsClear">When true, tags are stripped from the result with <see cref="GetValues"/>.</param>
    /// <param name="IsKongGe">Forwarded to <see cref="GetValues"/> when <paramref name="IsClear"/> is true.</param>
    /// <returns>The captured text; an empty string when the pattern matches nothing.</returns>
    public static async Task<string> GetJiTxt(string url, string expr, bool IsClear, bool IsKongGe)
    {
        var html = await GetTestOrgan(url);
        var captured = FirstMatchOrEmpty(html, expr);
        return IsClear ? GetValues(captured, IsKongGe) : captured;
    }

    // Returns the first capture of expr in text, or an empty string when there is no match.
    private static string FirstMatchOrEmpty(string text, string expr)
    {
        var matches = showMatchValue(text, expr);
        return matches.Count > 0 ? matches[0] : string.Empty;
    }
}
}
1.4 分页的dto和输出dto
直接新建类,更改一下名字和下面给出代码的类名相同即可
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
namespace PythonHelper
{
/// <summary>
/// Generic response envelope carrying a status code, a message, a count and a list of rows.
/// </summary>
/// <typeparam name="T">Type of the rows held in <see cref="data"/>.</typeparam>
public class TableDto<T>
{
    /// <summary>Status code set by the caller.</summary>
    public int code { get; set; }

    /// <summary>Message set by the caller; null until assigned.</summary>
    public string msg { get; set; }

    /// <summary>Item count set by the caller.</summary>
    public int count { get; set; }

    /// <summary>Rows of the response; starts out as an empty list, never null by default.</summary>
    public List<T> data { get; set; } = new List<T>();
}
}
using System;
using System.Collections.Generic;
using System.Text;
namespace PythonHelper.BQGDtos
{
/// <summary>
/// One entry extracted from a scraped list. The helper in this file fills
/// <see cref="CollectionDesc"/> and <see cref="GetHref"/> from regex captures;
/// the remaining properties are not assigned anywhere in the visible code.
/// </summary>
public class CollectionDto
{
/// <summary>NOTE(review): presumably the id of the novel this entry belongs to - confirm.</summary>
public string NovelId { get; set; }
/// <summary>First capture of the "name" pattern for this entry.</summary>
public string CollectionDesc { get; set; }
/// <summary>NOTE(review): presumably the entry's sequence number - confirm.</summary>
public string CollectionNo { get; set; }
/// <summary>NOTE(review): looks like a soft-delete flag - confirm.</summary>
public bool IsDelete { get; set; }
/// <summary>NOTE(review): meaning of this date is not shown in the visible code - confirm.</summary>
public DateTime ReportDate { get; set; }
/// <summary>First capture of the "address" pattern for this entry.</summary>
public string GetHref { get; set; }
}
}
1.5 api代码
PNewHelper是帮助类,请自行更改为自己命名的帮助类
/// <summary>
/// Scrapes the list found on the page at <paramref name="url"/> and returns one page of it.
/// </summary>
/// <param name="page">1-based page number; 0 or less is treated as the first page.</param>
/// <param name="limit">Maximum number of entries to return.</param>
/// <param name="url">Address of the page to scrape.</param>
/// <returns>
/// A table whose <c>data</c> holds the requested page and whose <c>count</c> is the total
/// number of scraped entries; <c>code</c> is 0 and <c>msg</c> is empty.
/// </returns>
[HttpPost]
public async Task<TableDto<CollectionDto>> GetNovalCollectionAll(int page, int limit, string url)
{
    // PNewHelper is the article's placeholder for the helper class; substitute your own helper class name.
    // The patterns contain no spaces ("<divid", "<ahref"), presumably because the helper strips
    // all spaces from the downloaded HTML before matching - confirm against your helper.
    var chapters = await PNewHelper.GetNovelJi(
        url,
        new List<string>() { "<divid=\"list\">(.*?)</div>", "<dd>(.*?)</dd>" },
        "<a.*?>(.*?)</a>",
        "<ahref=\'(.*?)\'>.*?</a>");

    // Page numbers start at 1; anything lower falls back to the first page.
    var skip = page <= 1 ? 0 : (page - 1) * limit;
    var pageItems = chapters.Skip(skip).Take(limit).ToList();

    // Exceptions are left to propagate: the former catch block only rethrew them.
    return new TableDto<CollectionDto>()
    {
        code = 0,
        msg = "",
        data = pageItems,
        count = chapters.Count
    };
}
/// <summary>
/// Downloads the page at <paramref name="url"/> and returns the text captured by <paramref name="expr"/>.
/// </summary>
/// <param name="url">Page address; the site host is prepended when the "www" check finds nothing.</param>
/// <param name="expr">Capture pattern; a default content pattern is used when null or empty.</param>
/// <param name="IsClear">Forwarded to the helper's <c>GetJiTxt</c>.</param>
/// <param name="IsKongGe">Forwarded to the helper's <c>GetJiTxt</c>.</param>
/// <returns>Whatever the helper's <c>GetJiTxt</c> returns for the resolved address and pattern.</returns>
[HttpPost]
public async Task<string> GetCollecOfTxt(string url, string expr, bool IsClear = false, bool IsKongGe = false)
{
    // PNewHelper is the article's placeholder for the helper class; substitute your own helper class name.
    var pattern = string.IsNullOrEmpty(expr) ? "<divid=\"content\">(.*?)</div>" : expr;

    // NOTE(review): showMatch is not shown in this file; presumably it returns the matched text,
    // so an empty result means the address carries no "www" and is treated as site-relative - confirm.
    var isRelative = string.IsNullOrEmpty(PNewHelper.showMatch(url, "www"));
    var target = isRelative ? "https://www.xbiquge.la" + url : url;

    // Exceptions propagate unchanged, exactly as the former catch-and-rethrow did.
    return await PNewHelper.GetJiTxt(target, pattern, IsClear, IsKongGe);
}
1.6 测试:打开swagger进行测试
注意:自己按照顺序来看啊