前言:本文重点介绍如何用 WebBrowser 跨域、跨 iframe 获取网页源码。本意是爬取《全职高手》的有声小说,但这类网站比较特殊:网页上广告一大堆,爬起来很麻烦。比如我爬取的这个网站,下载 mp3 源文件需要密钥,而且密钥会随时更新,并且嵌在某个 iframe 里,显然是专门为防止爬虫而设计的。
下面上代码,用来获取我想要的那个关键 iframe 的源码(这是各种百度“搬砖拆砖”拼出来的结果,苦笑):
using mshtml;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
namespace DownLoadNovel
{
public class FindHtml
{
// HTML source produced by a load; starts out as the empty string.
private String htmlString = "";
// URL most recently passed to Run.
private String url = "";
// Whether the run succeeded.
private bool success = false;

/// <summary>
/// Creates an instance with an empty result and the success flag cleared.
/// </summary>
public FindHtml()
{
}
/// <summary>
/// The resulting HTML source, or <c>null</c> while the success flag is not set.
/// </summary>
public String ResultHtml
{
    get { return success ? htmlString : null; }
}
/// <summary>
/// Loads the given URL on a dedicated STA worker thread and blocks the caller
/// until that thread ends or the time limit elapses.
/// </summary>
/// <param name="url">URL of the page to load.</param>
/// <param name="timeOut">
/// Maximum time to wait, in milliseconds. Zero or a negative value means
/// "do not wait at all".
/// </param>
/// <returns>
/// <c>true</c> if the worker thread ended within the time limit, or if the
/// success flag was already set when the limit expired; <c>false</c> if the
/// worker had to be aborted because of the timeout. A <c>true</c> result only
/// means "no timeout" - check <see cref="ResultHtml"/> for <c>null</c> to find
/// out whether HTML was actually captured.
/// </returns>
public bool Run(String url, int timeOut = 10000)
{
    this.url = url;

    // Clear the outcome of any earlier call: without this, a reused instance
    // that times out would still see the old success flag and report stale HTML.
    success = false;
    htmlString = "";

    Thread newThread = new Thread(NewThread);
    // A WebBrowser instance can only be created on a single-threaded-apartment thread.
    newThread.SetApartmentState(ApartmentState.STA);
    // Background thread: a worker that is still sitting in its message loop
    // after we return must not keep the whole process alive.
    newThread.IsBackground = true;
    newThread.Start();

    // Join returns as soon as the worker ends instead of polling every 100 ms.
    // Clamp negatives because Join treats -1 as "wait forever" and rejects
    // anything smaller.
    if (newThread.Join(Math.Max(timeOut, 0)))
    {
        return true;
    }

    // Timed out. If the result was captured anyway, report success and leave
    // the worker alone; otherwise stop it.
    if (success)
    {
        return true;
    }

    // NOTE(review): Thread.Abort is only supported on .NET Framework; on
    // .NET Core / .NET 5+ it throws PlatformNotSupportedException.
    newThread.Abort();
    return false;
}
/// <summary>
/// Entry point of the worker thread: creates the per-URL worker for this
/// instance and then runs a message loop on the current thread.
/// </summary>
private void NewThread()
{
    // The worker is constructed with this instance as its master.
    FindHtmlPerThread worker = new FindHtmlPerThread(this);

    // Run the message loop so the WebBrowser can finish loading and raise
    // its DocumentCompleted event.
    Application.Run();
}
/// <summary>
/// 用于处理一个url的核心类
/// </summary>
class FindHtmlPerThread : IDisposable
{
FindHtml master;
WebBrowser web;
public FindHtmlPerThread(FindHtml master)
{
this.master = master;
String url = master.url;
web = new WebBrowser();
web.ScriptErrorsSuppressed = true;
bool success = false;
try
{
web.Url = new Uri(url);
web.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(web_DocumentCompleted); // 对事件加委托
succ