爬取整站链接我采用的是递归算法。递归算法有个缺点:递归太深会栈溢出(可以用 Thread 构造函数指定栈的大小:Thread myThread = new Thread(new ThreadStart(MyMethod), 1024 * 1024);),所以也可以改用循环来写。
由于我是直接在内存中保存并输出所有链接,所以存在内存不足的问题。可以把程序编译为 64 位,物理内存有多大就能用多大,但单台计算机的内存终究有限,所以也可以改用数据库来存储链接。
下面是代码,需要先安装 NuGet 包 HtmlAgilityPack。注意:页面乱码是 HtmlAgilityPack 的编码识别问题,需要自行指定编码来解决,这里没有处理。
public class WebReptile // Usage: WebReptile.GetHtmlLinks("https://www.zzxx.org");
{
    // Relative (same-site) links waiting for the second-pass visit.
    static List<string> list = new List<string>();
    // Absolute links waiting for the first-pass visit.
    static List<string> list2 = new List<string>();
    static int i = 0;   // resume index into `list` for the current recursion level
    static int i2 = 0;  // resume index into `list2` for the current recursion level
    static int i3 = 0;  // total pages visited so far (progress counter)
    static HtmlWeb webClient = new HtmlWeb();
    // One shared HttpClient for the process lifetime (never new one per request).
    static HttpClient client = new HttpClient();
    // Site root ("scheme://host") of the page most recently crawled; used to
    // absolutize relative links before loading them.
    static string str;
    // Every link ever queued; prevents visiting the same URL twice.
    // (Replaces the original Dictionary<string,string> that was used as a set.)
    static HashSet<string> visited = new HashSet<string>();

    static WebReptile()
    {
        // 10-second timeout for both the HtmlAgilityPack loader and HttpClient.
        webClient.PreRequest = r => { r.Timeout = 10000; return true; };
        client.Timeout = TimeSpan.FromSeconds(10);
    }

    /// <summary>
    /// Recursively crawls <paramref name="url"/> and prints the title of every
    /// page found. Absolute links (containing "http") take the first-pass branch;
    /// relative links take the second-pass branch and are absolutized with
    /// <paramref name="s"/>. NOTE(review): deep recursion can still overflow the
    /// stack, and the dedupe set grows unboundedly in memory — exactly the two
    /// limitations the original author warned about.
    /// </summary>
    /// <param name="url">Absolute URL on the first call; relative on recursive calls.</param>
    /// <param name="s">Site root used to absolutize relative links (supplied by recursion).</param>
    public static void GetHtmlLinks(string url, string s = null)
    {
        try
        {
            // Second pass: a relative link (contains neither "http" nor "java",
            // the latter filtering out javascript: pseudo-links).
            if (!ContainsIgnoreCase(url, "java") && !ContainsIgnoreCase(url, "http"))
            {
                // Locals, not shared statics: recursive calls must not clobber
                // the document/node list of an outer frame (latent bug in the
                // original, which kept these in static fields).
                HtmlAgilityPack.HtmlDocument doc = webClient.Load(s + url);
                HtmlNodeCollection hrefList = doc.DocumentNode.SelectNodes(".//a[@href]");
                i++;
                i3++;
                Console.WriteLine("第二访问");
                PrintTitle(doc, url);
                if (hrefList != null)
                {
                    foreach (HtmlNode href in hrefList)
                    {
                        HtmlAttribute att = href.Attributes["href"];
                        // HashSet.Add is one lookup and reports novelty,
                        // replacing the ContainsKey + Add double lookup.
                        if (visited.Add(att.Value))
                        {
                            list.Add(att.Value);
                        }
                    }
                    for (int j = i - 1; j < list.Count; j++)
                    {
                        GetHtmlLinks(list[j], str);
                    }
                    list.Clear();
                    i = 0;
                }
            }
            // First pass: an absolute link.
            if (ContainsIgnoreCase(url, "http"))
            {
                HtmlAgilityPack.HtmlDocument doc = webClient.Load(url);
                HtmlNodeCollection hrefList = doc.DocumentNode.SelectNodes(".//a[@href]");
                i2++;
                i3++;
                str = GetUrl(url); // remember this site's root for relative links
                Console.WriteLine("第一访问");
                PrintTitle(doc, url);
                if (hrefList != null)
                {
                    foreach (HtmlNode href in hrefList)
                    {
                        HtmlAttribute att = href.Attributes["href"];
                        // Skip links back into the current site (the relative-link
                        // branch handles those) and javascript: pseudo-links.
                        if (ContainsIgnoreCase(att.Value, url) || ContainsIgnoreCase(att.Value, "java"))
                        {
                            continue;
                        }
                        if (visited.Add(att.Value))
                        {
                            list2.Add(att.Value);
                        }
                    }
                    for (int j = i2 - 1; j < list2.Count; j++) // e.g. http://www.123.com /123.com
                    {
                        GetHtmlLinks(list2[j], str);
                    }
                    // BUG FIX: the original cleared `list` here, so `list2` grew
                    // forever while i2 was reset, re-crawling stale entries.
                    // Clear the list this branch actually owns.
                    list2.Clear();
                    i2 = 0;
                }
            }
        }
        catch (Exception ex)
        {
            // Best-effort crawl: report the failure and keep going instead of
            // silently swallowing every exception as the original did.
            Console.WriteLine("抓取失败: " + url + " (" + ex.Message + ")");
        }
    }

    // Prints "<count><title> <url>" followed by a blank line; pages without a
    // <title> print only the blank line (the original reached the same result
    // by catching the NullReferenceException — exceptions as control flow).
    static void PrintTitle(HtmlAgilityPack.HtmlDocument doc, string url)
    {
        HtmlNode title = doc.DocumentNode.SelectSingleNode("//title");
        if (title != null)
        {
            Console.WriteLine(i3 + title.InnerText + " " + url);
        }
        Console.WriteLine();
    }

    // Ordinal, case-insensitive Contains. Avoids the allocations and culture
    // surprises (e.g. the Turkish 'I') of ToLower().Contains(...).
    static bool ContainsIgnoreCase(string haystack, string needle)
    {
        return haystack.IndexOf(needle, StringComparison.OrdinalIgnoreCase) >= 0;
    }

    /// <summary>
    /// Returns the "scheme://host" root of an absolute URL, or null when the
    /// input is not a parseable absolute URL. Replaces the hand-rolled
    /// Split('/') logic, which indexed arr[2] unconditionally and threw
    /// IndexOutOfRangeException on URLs without "//".
    /// </summary>
    static string GetUrl(string s)
    {
        Uri uri;
        if (Uri.TryCreate(s, UriKind.Absolute, out uri))
        {
            return uri.GetLeftPart(UriPartial.Authority);
        }
        return null;
    }

    /// <summary>
    /// Classifies <paramref name="url"/> by its Content-Type header and prints
    /// the category. Returns false only for "text/html" pages; true for every
    /// resource type and for any request failure (so callers skip the URL).
    /// </summary>
    public static bool IsResourceUrl(string url)
    {
        try
        {
            // Only the headers are needed, so stop after reading them instead
            // of downloading the whole body as the original .Result call did.
            // NOTE(review): sync-over-async is deadlock-prone under a UI or
            // classic ASP.NET sync context — acceptable in a console crawler.
            using (HttpResponseMessage rs = client.GetAsync(url, HttpCompletionOption.ResponseHeadersRead).GetAwaiter().GetResult())
            {
                // Content-Type may legitimately be absent; the original relied
                // on the blanket catch to absorb the resulting NRE.
                if (rs.Content == null || rs.Content.Headers.ContentType == null)
                {
                    return true;
                }
                string s = rs.Content.Headers.ContentType.MediaType;
                Console.WriteLine(s);
                if (s.StartsWith("text/html", StringComparison.OrdinalIgnoreCase))
                {
                    Console.WriteLine(1);
                    return false;
                }
                if (s.StartsWith("application/", StringComparison.OrdinalIgnoreCase))
                {
                    Console.WriteLine("是应用软件");
                }
                else if (s.StartsWith("audio/", StringComparison.OrdinalIgnoreCase))
                {
                    Console.WriteLine("是音乐文件");
                }
                else if (s.StartsWith("image/", StringComparison.OrdinalIgnoreCase))
                {
                    Console.WriteLine("是图片文件");
                }
                else if (s.StartsWith("video/", StringComparison.OrdinalIgnoreCase))
                {
                    Console.WriteLine("是视频文件");
                }
                else
                {
                    Console.WriteLine("是资源文件");
                }
                return true;
            }
        }
        catch (Exception)
        {
            // Network failure or timeout: treat the URL as a resource to skip.
            return true;
        }
    }
}
调用示例:
WebReptile.GetHtmlLinks("https://www.zzxx.org"); 或
WebReptile.GetHtmlLinks("https://www.zzxx.org/xs/24183/"); 或
WebReptile.GetHtmlLinks("https://www.baidu.com"); 或
WebReptile.GetHtmlLinks("https://www.kugou.com");