C#爬虫之WebClient、WebRequest、WebResponse应用
WebClient 获取网页Html
// Download the page at `url` and decode the bytes as UTF-8 into `pageHtml`.
// WebClient implements IDisposable; the using statement releases it even when the download throws.
using (WebClient client = new WebClient())
{
    client.Credentials = CredentialCache.DefaultCredentials; // network credentials used to authenticate the request
    Byte[] pageData = client.DownloadData(url); // download the raw bytes from the given address
    pageHtml = Encoding.UTF8.GetString(pageData); // use this line when the page is encoded as UTF-8
}
请求数据并保存为文件
// Request the resource at `urls` (the image's src value) and write the response body to a .jpg file.
try
{
    WebRequest request = WebRequest.Create(urls); // urls holds the image's src value
    string path = "H://imgs//" + count.ToString() + "//" + aa.ToString() + "//" + i.ToString() + ".jpg"; // target file name

    // The using statements close the response, the network stream and the file
    // even when reading or writing throws part-way through.
    using (WebResponse response = request.GetResponse())
    using (Stream reader = response.GetResponseStream())
    // FileMode.Create truncates an existing file; OpenOrCreate would leave stale
    // trailing bytes behind when the new image is shorter than the old one.
    using (FileStream writer = new FileStream(path, FileMode.Create, FileAccess.Write))
    {
        reader.CopyTo(writer); // stream the whole response body into the file
    }
    // download succeeded
}
catch (Exception msg)
{
    // Best effort: report the failure and give up on this image.
    Console.Write(msg.Message);
    return;
}
利用正则表达式和队列可以爬取网页上的图片,请看下面例子:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.IO;
using System.Text.RegularExpressions;
using System.Net;
using System.Collections;
namespace ConsoleApp16
{
    /// <summary>
    /// Console crawler: downloads pages, extracts links and image addresses with
    /// regular expressions, and saves the images below a fixed local folder.
    /// </summary>
    class Program
    {
        // Root folder that every downloaded image is written under.
        private const string ImageRoot = "H://qimg//";

        /// <summary>
        /// Downloads the page at <paramref name="url"/> and decodes it as UTF-8.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>
        /// The page text, or an empty string when the download fails with a
        /// <see cref="WebException"/> (the message is written to the console).
        /// </returns>
        public static string HtmlText(string url)
        {
            string pageHtml = "";
            try
            {
                // WebClient is IDisposable; dispose it even when the download throws.
                using (WebClient client = new WebClient())
                {
                    client.Credentials = CredentialCache.DefaultCredentials; // network credentials used to authenticate the request
                    byte[] pageData = client.DownloadData(url);              // raw bytes of the page
                    pageHtml = Encoding.UTF8.GetString(pageData);            // the page is decoded as UTF-8
                }
            }
            catch (WebException webEx)
            {
                Console.WriteLine(webEx.Message);
            }
            return pageHtml;
        }

        /// <summary>
        /// Downloads the resource at <paramref name="urls"/> and writes it to
        /// <c>H://qimg//{count}//{aa}//{i}.jpg</c>.
        /// </summary>
        /// <param name="i">Number used as the file name.</param>
        /// <param name="count">Number used as the first-level folder name.</param>
        /// <param name="aa">Number used as the second-level folder name.</param>
        /// <param name="urls">Address of the image to download.</param>
        /// <remarks>
        /// Any exception is caught and its message is written to the console; the
        /// method then returns without saving the image. The target folder is not
        /// created here.
        /// </remarks>
        public static void save(int i, int count, int aa, string urls)
        {
            try
            {
                WebRequest request = WebRequest.Create(urls); // urls holds the image's src value
                string path = ImageRoot + count.ToString() + "//" + aa.ToString() + "//" + i.ToString() + ".jpg"; // target file name

                // The using statements close the response, the network stream and the
                // file even when reading or writing throws part-way through.
                using (WebResponse response = request.GetResponse())
                using (Stream reader = response.GetResponseStream())
                // FileMode.Create truncates an existing file; OpenOrCreate would leave
                // stale trailing bytes when the new image is shorter than the old one.
                using (FileStream writer = new FileStream(path, FileMode.Create, FileAccess.Write))
                {
                    reader.CopyTo(writer); // stream the whole response body into the file
                }
                // download succeeded
            }
            catch (Exception msg)
            {
                // Best effort: report the failure and skip this image.
                Console.Write(msg.Message);
            }
        }

        /// <summary>
        /// Entry point: seeds the page queue from the home page, then processes
        /// queued pages until the queue is empty, saving up to eight images for
        /// each of the three "sub" variants of every processed page.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            Queue<string> html = new Queue<string>(); // pages still to visit
            Queue<string> img = new Queue<string>();  // image addresses found on the current sub-page

            // Seed the page queue with the links matched on the home page.
            string pageHtml = HtmlText(网站主页urls);
            Regex re = new Regex(正则表达式匹配下一级网页链接);
            foreach (Match ma in re.Matches(pageHtml))
            {
                html.Enqueue(urls + ma.Value); // enqueue
            }

            // The patterns do not change between iterations, so build them once.
            Regex res = new Regex(正则表达式匹配图片地址, RegexOptions.None);
            Regex ree = new Regex(正则表达式, RegexOptions.None); // matches further pages to add to the queue

            int i = 1;
            while (html.Count > 0)
            {
                if (i < 30)
                {
                    // While the counter is below 30, every even-numbered entry is
                    // discarded without being processed.
                    if (i % 2 == 0)
                    {
                        i++;
                        html.Dequeue();
                        continue;
                    }
                }
                else
                {
                    Directory.CreateDirectory(ImageRoot + i.ToString()); // create the folder for this page
                }

                string ss = html.Dequeue();
                for (int aa = 1; aa < 4; aa++)
                {
                    string Html = HtmlText(ss + "?" + "sub=" + aa.ToString());

                    foreach (Match mas in res.Matches(Html))
                    {
                        img.Enqueue("https://" + mas.Value);
                    }

                    foreach (Match mae in ree.Matches(Html))
                    {
                        // Strip the quote and '>' characters captured by the match.
                        html.Enqueue("https://" + mae.Value.Replace("\"", "").Replace(">", ""));
                    }

                    Directory.CreateDirectory(ImageRoot + i.ToString() + "//" + aa.ToString());

                    // Drain the image queue: save the first eight images, drop the rest.
                    for (int s = 1; img.Count > 0; s++)
                    {
                        string imageUrl = img.Dequeue();
                        if (s <= 8)
                        {
                            save(s, i, aa, imageUrl);
                        }
                    }
                    Console.WriteLine("ok");
                }

                // The current page was already removed at the top of this iteration, so
                // this call discards the NEXT queued page. That behaviour is kept, but it
                // is guarded: an unconditional Dequeue throws InvalidOperationException
                // when the queue is empty.
                // NOTE(review): this looks like a leftover second dequeue - confirm
                // whether skipping a page here is intended.
                if (html.Count > 0)
                {
                    html.Dequeue();
                }
                i++;
            }
        }
    }
}
写好正则表达式后,程序就会自动不断地从网站上爬取图片,并保存到本地。使用控制台程序可以让程序执行得更快。本人只是初学者,最近因为要用到才拿来练手,怕以后忘记,所以记录下来。