1. 随便找个公众号地址我们来抓一下代码
(随便找的)
2. 先用常规方式抓取页面源码(用 WebClient 下载,再按页面声明的 charset 解码)
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes it into a string.
/// </summary>
/// <remarks>
/// The bytes are first decoded as GB2312 only so that the charset declared in the markup
/// can be read. When a charset is declared and the platform recognises it, the bytes are
/// decoded again with that encoding; otherwise the GB2312 text is returned.
/// </remarks>
/// <param name="url">Address of the page to download.</param>
/// <returns>The decoded page text, or <c>null</c> when the download or decoding fails.</returns>
public static string GetHtmlByUrl(string url)
{
    using (WebClient wc = new WebClient())
    {
        try
        {
            wc.UseDefaultCredentials = true;
            wc.Proxy = new WebProxy();
            wc.Proxy.Credentials = CredentialCache.DefaultCredentials;
            wc.Credentials = System.Net.CredentialCache.DefaultCredentials;
            byte[] bt = wc.DownloadData(url);

            // Provisional decode: only used to locate the charset declaration.
            string txt = System.Text.Encoding.GetEncoding("GB2312").GetString(bt);

            System.Text.Encoding declared = TryGetDeclaredEncoding(GetCharset(txt));
            return declared == null ? txt : declared.GetString(bt);
        }
        catch (Exception)
        {
            // Best-effort contract: callers test the result for null instead of catching.
            return null;
        }
    }
}

/// <summary>
/// Maps a charset name taken from the markup to an encoding.
/// </summary>
/// <param name="charset">Charset name as declared by the page; may be null or empty.</param>
/// <returns>The matching encoding, or <c>null</c> when the name is missing or unknown.</returns>
private static System.Text.Encoding TryGetDeclaredEncoding(string charset)
{
    if (string.IsNullOrWhiteSpace(charset))
    {
        return null;
    }

    string name = charset.Trim();

    // Ordinal comparison: a culture-sensitive ToUpper() mis-handles the 'i' in "unicode"
    // under Turkish cultures.
    if (name.Equals("UTF-8", StringComparison.OrdinalIgnoreCase))
    {
        return System.Text.Encoding.UTF8;
    }
    if (name.Equals("UNICODE", StringComparison.OrdinalIgnoreCase))
    {
        return System.Text.Encoding.Unicode;
    }

    try
    {
        return System.Text.Encoding.GetEncoding(name);
    }
    catch (ArgumentException)
    {
        // Unknown or unsupported name: let the caller keep its GB2312 fallback.
        return null;
    }
}
/// <summary>
/// Extracts the charset name declared in an HTML document.
/// </summary>
/// <remarks>
/// Looks first for a <c>charset=</c> token inside a quoted <c>content</c> attribute value,
/// then for a <c>&lt;meta charset=...&gt;</c> tag. The first declaration found wins.
/// </remarks>
/// <param name="html">Markup to inspect; may be null or empty.</param>
/// <returns>The declared charset name exactly as written, or an empty string when none is found.</returns>
public static string GetCharset(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return "";
    }

    // The lazy, quote-bounded prefix keeps the match inside one attribute value, so a later
    // charset="..." on the same line (e.g. on a <script> tag) cannot win.
    Match match = Regex.Match(
        html,
        @"content\s*=\s*[""'][^""'>]*?\bcharset\s*=\s*[""']?(?<charset>[^""'\s;>]+)",
        RegexOptions.IgnoreCase);

    if (!match.Success)
    {
        // The value stops at whitespace, '/' or '>' so an unquoted declaration does not
        // swallow the markup that follows it.
        match = Regex.Match(
            html,
            @"<\s*meta\s+charset\s*=\s*[""']?(?<charset>[^""'\s/>]+)",
            RegexOptions.IgnoreCase);
    }

    return match.Success ? match.Groups["charset"].Value : "";
}
3. 我们将爬出来的网页保存一哈瞅瞅看
/// <summary>
/// Writes <paramref name="html"/> followed by a line terminator to a UTF-8 file,
/// overwriting any existing content.
/// </summary>
/// <param name="html">Markup to write.</param>
/// <param name="path">
/// Destination file. Defaults to <c>MyHtml.html</c>, which is resolved against the process
/// working directory.
/// </param>
/// <returns><c>true</c> when the file was written; <c>false</c> when any exception occurred.</returns>
public static bool SaveHtml(string html, string path = "MyHtml.html")
{
    try
    {
        using (StreamWriter sw = new StreamWriter(path, false, Encoding.UTF8))
        {
            sw.WriteLine(html);
        }
        return true;
    }
    catch (Exception)
    {
        // Failure is reported through the return value by contract.
        return false;
    }
}
这样调用
// Sample article address used throughout the walkthrough.
var url = "https://mp.weixin.qq.com/s?__biz=MjM5MDk4NTg2MA==&mid=2652650152&idx=1&sn=93e4aeb524ee94cb64d1e9dbd5cf0266&chksm=bd54e1438a236855e93c27f0565b91380277ab484f8deda25745986b295b58046d75e52a68e2&scene=21#wechat_redirect";
// Download the page; GetHtmlByUrl returns null when the download fails.
var backHtml = GetHtmlByUrl(url);
// Write the downloaded markup to disk; the result is true on success, false on any error.
var saveResult = SaveHtml(backHtml);
这是原网页
这是我们爬到本地的网页
此时我们F12打开调试可以看到,很多错误信息
可以看到,页面里的 js 通过 location.protocol、location.href、location.host 拼接地址,在本地打开时这些值都变成了本地地址。所以我们需要修改原来的 Html 代码,把这三个值替换成原文章的绝对地址
4.我们采用第三方的 AngleSharp 来改Html代码 (以前都写正则,现在忘了怎么写了。。),直接Nuget安装就行了
/// <summary>
/// Rewrites a downloaded page so that scripts relying on <c>location.*</c> see the
/// original article address instead of the local one.
/// </summary>
/// <param name="html">Markup of the downloaded page.</param>
/// <param name="url">Address the page was downloaded from.</param>
/// <returns>The serialized document after the script injection and text substitutions.</returns>
public static string HandleHtml(string html, string url)
{
    var dom = new HtmlParser().ParseDocument(html);
    var source = new Uri(url);

    // Inject a script that defines the replacement globals used by the substitutions below.
    var bootstrap = document_CreateScript(dom);
    bootstrap.TextContent = $"var newhref=\"{url}\";var newhost=\"{source.Host}\";var newprotocol=\"{source.Scheme}:\";";
    dom.Head.Append(bootstrap);

    // Substitutions are applied to the serialized markup, in this exact order.
    var substitutions = new[]
    {
        new[] { "\"//res", "\"https://res" },
        new[] { "location.protocol", "newprotocol" },
        new[] { "location.href", "newhref" },
        new[] { "location.host", "newhost" },
    };

    var markup = dom.DocumentElement.OuterHtml;
    foreach (var pair in substitutions)
    {
        markup = markup.Replace(pair[0], pair[1]);
    }
    return markup;
}

/// <summary>
/// Creates the empty <c>script</c> element that receives the injected globals.
/// </summary>
private static AngleSharp.Dom.IElement document_CreateScript(AngleSharp.Dom.IDocument dom)
{
    return dom.CreateElement("script");
}
调用方式我们也改一下
// Skip rewriting and saving when the download produced no markup (null, empty or whitespace).
if (!string.IsNullOrWhiteSpace(backHtml))
{
// Rewrite the page against the original article address, then write the result to disk.
var newHtml = HandleHtml(backHtml, url);
var saveResult = SaveHtml(newHtml);
}
然后我们再看看是什么效果
图片出来了,视频没有出来。。。
继续打开调试,发现是有些js没有加载,
这些 js 还是用的本地路径;html 引用的 js 本来是网络地址,除非把 js 也下载下来才能在本地直接打开。于是,我们把爬下来的网页挂在 IIS 下试试
一个是图片未授权,一个是图片跨域 。。
未授权好办,直接在 html 的 head 里加一个 <meta name="referrer" content="never"> 节点即可;跨域我想到的办法是用自己的服务器中转一次(也就是把图片地址改成我们服务器的接口地址)。直接上处理过的代码
/// <summary>
/// Fetches the image at <paramref name="imgUrl"/> on the server side and returns its bytes,
/// so the browser loads it from this API instead of from the original host.
/// </summary>
/// <param name="imgUrl">Absolute http/https address of the image, taken from the query string.</param>
/// <returns>
/// 200 with the image bytes, or 400 when <paramref name="imgUrl"/> is missing or is not an
/// absolute http/https URL.
/// </returns>
[HttpGet]
[Route("GetImgStream")]
public HttpResponseMessage GetImgStream([FromUri]string imgUrl)
{
    // Reject anything that is not an absolute http(s) URL before issuing a request for it.
    // NOTE(review): this endpoint still fetches any http(s) host the caller names; consider
    // restricting it to the image hosts actually rewritten by HandleHtml.
    Uri target;
    bool isHttpUrl = Uri.TryCreate(imgUrl, UriKind.Absolute, out target)
        && (target.Scheme == Uri.UriSchemeHttp || target.Scheme == Uri.UriSchemeHttps);
    if (!isHttpUrl)
    {
        return new HttpResponseMessage(HttpStatusCode.BadRequest);
    }

    var imageBuffer = GetDownloadStream(imgUrl);
    var respimg = new HttpResponseMessage(HttpStatusCode.OK)
    {
        Content = new System.Net.Http.ByteArrayContent(imageBuffer)
    };
    // "image/jpeg" is the registered media type; "image/jpg" is not.
    respimg.Content.Headers.ContentType = new MediaTypeHeaderValue("image/jpeg");
    return respimg;
}
/// <summary>
/// Downloads the resource at <paramref name="url"/> with an HTTP GET and returns the
/// complete response body.
/// </summary>
/// <param name="url">Absolute http/https address of the file to download.</param>
/// <returns>The response body as a byte array.</returns>
/// <exception cref="NotSupportedException">
/// <paramref name="url"/> does not resolve to an HTTP request.
/// </exception>
public byte[] GetDownloadStream(string url)
{
    HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
    if (request == null)
    {
        throw new NotSupportedException("Only http/https addresses can be downloaded: " + url);
    }
    request.Method = WebRequestMethods.Http.Get;

    // The response, its stream and the buffer are all released even when reading fails.
    // Exceptions propagate unchanged so the original stack trace is preserved.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream responseStream = response.GetResponseStream())
    using (MemoryStream ms = new MemoryStream())
    {
        responseStream.CopyTo(ms);
        return ms.ToArray();
    }
}
我们写一个webapi,然后把html所有的图片地址都换成我们的api地址动态获取图片,于是我们的代码改成了这样
/// <summary>
/// Rewrites a downloaded article page so it can be served from another host.
/// </summary>
/// <remarks>
/// Adds a <c>referrer</c> meta tag, injects globals that stand in for <c>location.*</c>,
/// makes protocol-relative stylesheet links absolute, swaps recognised video iframes for
/// <c>video</c> elements, and finally applies a fixed set of text substitutions to the
/// serialized markup.
/// </remarks>
/// <param name="html">Markup of the downloaded page.</param>
/// <param name="url">Address the page was downloaded from.</param>
/// <returns>The rewritten markup.</returns>
public static string HandleHtml(string html, string url)
{
    var parser = new HtmlParser();
    var document = parser.ParseDocument(html);
    Uri myurl = new Uri(url);

    // Referrer policy node, added for the "unauthorized image" problem.
    var addMetaDom = document.CreateElement("meta");
    addMetaDom.SetAttribute("name", "referrer");
    addMetaDom.SetAttribute("content", "never");
    document.Head.Append(addMetaDom);

    // Globals that the text substitutions at the end swap in for location.*.
    var addNode = document.CreateElement("script");
    addNode.TextContent = $"var newhref=\"{url}\";var newhost=\"{myurl.Host}\";var newprotocol=\"{myurl.Scheme}:\";";
    document.Head.Append(addNode);

    // Snapshot the matches first: document.All is evaluated lazily and the DOM is edited below.
    var linkItems = document.All.Where(p => p.LocalName == "link").ToList();
    foreach (var item in linkItems)
    {
        var href = item.GetAttribute("href");
        if (!string.IsNullOrWhiteSpace(href) && href.Length > 2 && href.StartsWith("//", StringComparison.Ordinal))
        {
            // Set the attribute directly; replacing text inside the serialized OuterHtml
            // misses hrefs whose serialized form is entity-escaped (e.g. '&' as "&amp;").
            item.SetAttribute("href", "https:" + href);
        }
    }

    // Video iframes: resolve the real address and put a <video> element in the iframe's place.
    var videoItems = document.All
        .Where(p => p.LocalName == "iframe" && p.ClassName == "video_iframe rich_pages")
        .ToList();
    foreach (var item in videoItems)
    {
        var vid = item.GetAttribute("data-mpvid");
        if (string.IsNullOrWhiteSpace(vid))
        {
            continue;
        }
        var realUrl = GetRealVideo(vid);
        if (!string.IsNullOrWhiteSpace(realUrl) && item.Parent != null)
        {
            var addvideoNode = document.CreateElement("video");
            addvideoNode.SetAttribute("src", realUrl);
            addvideoNode.SetAttribute("controls", "controls");
            // Replace in place so the video keeps the iframe's position among its siblings.
            item.Parent.ReplaceChild(addvideoNode, item);
        }
    }

    var newOuterHtml = document.DocumentElement.OuterHtml;
    // Text substitutions on the serialized markup.
    // NOTE(review): the image proxy host below is a placeholder, and the image address is
    // passed as an unescaped query value — confirm the real endpoint and whether it needs encoding.
    return newOuterHtml.Replace("\"//res", "\"https://res").Replace("location.protocol", "newprotocol").Replace("location.href", "newhref").Replace("location.host", "newhost")
        .Replace("src=\"https://mmbiz.qpic.cn/", "src=\"http://自己的webapi地址/GetImgStream?imgUrl=http://mmbiz.qpic.cn/")
        .Replace("src=\"/mp/videoplayer?", "src=\"https://mp.weixin.qq.com/mp/videoplayer?");
}
好了,图片出来了,视频也出来了,本以为大功告成,突然发现视频还有另一种格式
打开F12,我们可以看到这个视频是iframe套了一层,而且路径不能直接打开
这就是视频的加密机制,于是我们需要把视频给解密出来 ,还是看请求来分析
我们注意到这个请求,url就是真实的视频地址,而入参的id就是上面那个iframe的id,于是,我们需要把视频先解密出来,然后把Iframe替换成video标签
/// <summary>
/// Rewrites a downloaded article page so it can be served from another host.
/// </summary>
/// <remarks>
/// Adds a <c>referrer</c> meta tag, injects globals that stand in for <c>location.*</c>,
/// makes protocol-relative stylesheet links absolute, swaps recognised video iframes for
/// <c>video</c> elements, and finally applies a fixed set of text substitutions to the
/// serialized markup.
/// </remarks>
/// <param name="html">Markup of the downloaded page.</param>
/// <param name="url">Address the page was downloaded from.</param>
/// <returns>The rewritten markup.</returns>
public static string HandleHtml(string html, string url)
{
    var parser = new HtmlParser();
    var document = parser.ParseDocument(html);
    Uri myurl = new Uri(url);

    // Referrer policy node, added for the "unauthorized image" problem.
    var addMetaDom = document.CreateElement("meta");
    addMetaDom.SetAttribute("name", "referrer");
    addMetaDom.SetAttribute("content", "never");
    document.Head.Append(addMetaDom);

    // Globals that the text substitutions at the end swap in for location.*.
    var addNode = document.CreateElement("script");
    addNode.TextContent = $"var newhref=\"{url}\";var newhost=\"{myurl.Host}\";var newprotocol=\"{myurl.Scheme}:\";";
    document.Head.Append(addNode);

    // Snapshot the matches first: document.All is evaluated lazily and the DOM is edited below.
    var linkItems = document.All.Where(p => p.LocalName == "link").ToList();
    foreach (var item in linkItems)
    {
        var href = item.GetAttribute("href");
        if (!string.IsNullOrWhiteSpace(href) && href.Length > 2 && href.StartsWith("//", StringComparison.Ordinal))
        {
            // Set the attribute directly; replacing text inside the serialized OuterHtml
            // misses hrefs whose serialized form is entity-escaped (e.g. '&' as "&amp;").
            item.SetAttribute("href", "https:" + href);
        }
    }

    // Video iframes: resolve the real address and put a <video> element in the iframe's place.
    var videoItems = document.All
        .Where(p => p.LocalName == "iframe" && p.ClassName == "video_iframe rich_pages")
        .ToList();
    foreach (var item in videoItems)
    {
        var vid = item.GetAttribute("data-mpvid");
        if (string.IsNullOrWhiteSpace(vid))
        {
            continue;
        }
        var realUrl = GetRealVideo(vid);
        if (!string.IsNullOrWhiteSpace(realUrl) && item.Parent != null)
        {
            var addvideoNode = document.CreateElement("video");
            addvideoNode.SetAttribute("src", realUrl);
            addvideoNode.SetAttribute("controls", "controls");
            // Replace in place so the video keeps the iframe's position among its siblings.
            item.Parent.ReplaceChild(addvideoNode, item);
        }
    }

    var newOuterHtml = document.DocumentElement.OuterHtml;
    // Text substitutions on the serialized markup.
    // NOTE(review): the image proxy host below is a placeholder, and the image address is
    // passed as an unescaped query value — confirm the real endpoint and whether it needs encoding.
    return newOuterHtml.Replace("\"//res", "\"https://res").Replace("location.protocol", "newprotocol").Replace("location.href", "newhref").Replace("location.host", "newhost")
        .Replace("src=\"https://mmbiz.qpic.cn/", "src=\"http://自己的webapi地址/GetImgStream?imgUrl=http://mmbiz.qpic.cn/")
        .Replace("src=\"/mp/videoplayer?", "src=\"https://mp.weixin.qq.com/mp/videoplayer?");
}
/// <summary>
/// Resolves the playable address of an article video from its <paramref name="vid"/>.
/// </summary>
/// <remarks>
/// Queries the <c>get_mp_video_play_url</c> action and returns the <c>url</c> of the first
/// entry in the response's <c>url_info</c> list.
/// NOTE(review): the <c>__biz</c> and <c>mid</c> query values are fixed literals here —
/// confirm whether they must match the article being processed.
/// </remarks>
/// <param name="vid">Video id, read from the iframe's <c>data-mpvid</c> attribute.</param>
/// <returns>The address of the first video entry in the response.</returns>
/// <exception cref="InvalidOperationException">
/// The response is empty, cannot be parsed into a model, or lists no video entries.
/// </exception>
public static string GetRealVideo(string vid)
{
    // Escape the id so reserved characters cannot alter the query string.
    string safeVid = Uri.EscapeDataString(vid ?? string.Empty);
    string txUrl = $"https://mp.weixin.qq.com/mp/videoplayer?action=get_mp_video_play_url&preview=0&__biz=MzI4NDk4OTQxMg==&mid=2247485200&idx=1&vid={safeVid}&uin=&key=&pass_ticket=&wxtoken=777&appmsg_token=&x5=0&f=json";
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(txUrl);
    request.Method = "GET";
    var backJson = GetBackHtml(request, "1");

    // An empty body is reported as a parse failure rather than an ArgumentNullException
    // from the deserializer.
    var realUrl = string.IsNullOrWhiteSpace(backJson)
        ? null
        : JsonConvert.DeserializeObject<VideoSerializeModel>(backJson);

    // videoinfos is null when the response carries "url_info": null.
    if (realUrl == null || realUrl.videoinfos == null || realUrl.videoinfos.Count == 0)
    {
        throw new InvalidOperationException("视频解析错误");
    }
    return realUrl.videoinfos[0].url;
}
/// <summary>
/// Model that <c>GetRealVideo</c> deserializes the video-player JSON response into.
/// </summary>
public class VideoSerializeModel
{
/// <summary>
/// Video title
/// </summary>
public string title { get; set; }
/// <summary>
/// Entries bound to the JSON <c>url_info</c> property; <c>GetRealVideo</c> returns the
/// <c>url</c> of the first entry. Starts as an empty list.
/// </summary>
[JsonProperty(PropertyName = "url_info")]
public List<VideoModel> videoinfos = new List<VideoModel>();
}
/// <summary>
/// One entry of the <c>url_info</c> list held by <see cref="VideoSerializeModel"/>.
/// </summary>
/// <remarks>
/// Every member is kept as a string. NOTE(review): except for <c>url</c>, the meanings
/// below are inferred from the member names — confirm against a real response.
/// </remarks>
public class VideoModel
{
/// <summary>Presumably the video duration in milliseconds — TODO confirm.</summary>
public string duration_ms { get; set; }
/// <summary>Presumably the file size; unit not shown here — TODO confirm.</summary>
public string filesize { get; set; }
/// <summary>Presumably an identifier of the encoding/quality variant — TODO confirm.</summary>
public string format_id { get; set; }
/// <summary>Presumably the frame height in pixels — TODO confirm.</summary>
public string height { get; set; }
/// <summary>Address that <c>GetRealVideo</c> returns and <c>HandleHtml</c> uses as the video source.</summary>
public string url { get; set; }
/// <summary>Presumably the frame width in pixels — TODO confirm.</summary>
public string width { get; set; }
}
然后大功告成,我们就可以根据公众号网页的url 直接放到我们自己的网站上了