using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
using HtmlAgilityPack;
using Newtonsoft.Json;
/// <summary>
/// Entry point: downloads the page at <c>url</c>, extracts every anchor tag with a
/// regular expression, and passes each link address and link text to <c>NextLinks</c>.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    string url = ""; // address of the page to crawl

    // An empty or malformed address cannot be downloaded; stop with a clear
    // message instead of letting WebClient throw.
    Uri pageUri;
    if (!Uri.TryCreate(url, UriKind.Absolute, out pageUri))
    {
        Console.Error.WriteLine("Invalid or empty page URL: \"" + url + "\"");
        return;
    }

    string pageHtml;
    // WebClient is IDisposable; release it as soon as the page text is in memory.
    using (WebClient wc = new WebClient())
    {
        wc.Encoding = Encoding.UTF8;
        pageHtml = wc.DownloadString(pageUri); // page text
    }

    // Match every <a> tag: group "url" captures the href value,
    // group "text" captures the content between <a ...> and </a>.
    string pattern = "<a(\\s+(href=\"(?<url>([^\"])*)\"|'([^'])*'|\\w+=\"(([^\"])*)\"|'([^'])*'))+>(?<text>(.*?))</a>";
    foreach (Match match in Regex.Matches(pageHtml, pattern))
    {
        string href = match.Groups["url"].Value;   // link address of the <a> tag
        string title = match.Groups["text"].Value; // inner content of the <a> tag

        // An anchor without an href, or a fragment-only href, points back at the
        // current page; following it would only download the same page again.
        if (href.Length == 0 || href.StartsWith("#", StringComparison.Ordinal))
        {
            continue;
        }

        // hrefs are frequently relative ("/a/b.html"); resolve them against the
        // page address so the next download receives an absolute URL.
        Uri linkUri;
        if (!Uri.TryCreate(pageUri, href, out linkUri))
        {
            continue;
        }

        // Skip links such as "javascript:" or "mailto:" that are not web pages.
        if (linkUri.Scheme != Uri.UriSchemeHttp && linkUri.Scheme != Uri.UriSchemeHttps)
        {
            continue;
        }

        NextLinks(linkUri.AbsoluteUri, title);
    }
}
// 获取页面接口参数,模拟调用
public static string NextLinks(string url,string title)
{
WebClient wc2 = new WebClient();
wc2.Encoding = Encoding.UTF8;
var simpleCrawlResult2 = wc2.DownloadString(url);
// HtmlDocument htmlDoc2 = new HtmlDocument();
// htmlDoc2.LoadHtml(simpleCrawlResult2);
// HtmlAgilityPack.HtmlNod
C# 爬虫:获取图片 URL 和音频 URL 的控制台程序——将 URL 地址下载到本地(MP3、图片)
最新推荐文章于 2024-08-20 22:07:59 发布