最近在开发一个项目,甲方要求从另一个网站采集信息列表,并能跳转到对应页面,于是简单实现了一个信息列表采集功能,代码如下:
/// <summary>
/// Collects the video list from the remote site: downloads the site's front page,
/// extracts every anchor that carries both an <c>href</c> and a <c>title</c> from the
/// list fragment, and passes each link that is not yet present in <c>[ArticInfo]</c>
/// to <c>AddHtmlToArt</c>.
/// </summary>
/// <param name="sender">Event source; not used. The signature matches a <c>System.Timers.Timer.Elapsed</c> handler.</param>
/// <param name="e">Elapsed-event data; not used.</param>
/// <remarks>
/// Best effort: every failure is traced and swallowed, so no exception leaves this method.
/// </remarks>
public static void getVideos(object sender, System.Timers.ElapsedEventArgs e)
{
    const string siteRoot = "http://www.xxx.com/";

    try
    {
        string strHtml = GetWholeHtmlCode(siteRoot);

        // Narrow the page down to the fragment that contains the list: everything
        // from the list container up to (but excluding) the footer.
        // GetWholeHtmlCode returns an error message instead of HTML on failure, so a
        // missing marker is an expected situation and is handled without throwing.
        int listStart = strHtml.IndexOf("<div class='x#xxd'>", StringComparison.Ordinal);
        if (listStart < 0)
        {
            System.Diagnostics.Trace.TraceWarning("getVideos: list container not found in downloaded page.");
            return;
        }
        strHtml = strHtml.Substring(listStart);

        int listEnd = strHtml.IndexOf("<div id=Footer>", StringComparison.Ordinal);
        if (listEnd < 0)
        {
            System.Diagnostics.Trace.TraceWarning("getVideos: footer marker not found in downloaded page.");
            return;
        }
        strHtml = strHtml.Substring(0, listEnd);

        // Matches <a ... href=URL ... title=TITLE ...>TEXT</a>; only the "url" and
        // "title" groups are consumed below.
        Regex reg = new Regex(@"(?is)<a[^>]+?href=(['""]?)(?<url>[^'""\s>]+).+?title=(['""]?)(?<title>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");
        MatchCollection mc = reg.Matches(strHtml);

        foreach (Match m in mc)
        {
            try
            {
                string url = m.Groups["url"].Value;
                string title = m.Groups["title"].Value;

                // Links without an "http:/" part are treated as site-relative.
                // NOTE(review): absolute "https://" links also get the site root
                // prepended by this check - confirm whether the site ever emits them.
                if (!url.Contains("http:/"))
                {
                    url = siteRoot + url;
                }

                // Skip links that are already stored (the loop moves on to the next link).
                // The regex never captures quote characters in the url; the quote
                // doubling is defence in depth because the statement is built by
                // concatenation. Prefer a parameterized SqlHelper overload if one exists.
                string sql = " SELECT [Path] FROM [ArticInfo] where [ArticPath]='" + url.Replace("'", "''") + "'";
                object obj = SqlHelper.ExecuteScalar(Art_conn, CommandType.Text, sql);
                if (obj != null)
                {
                    continue;
                }

                // Not stored yet: add it to the table.
                AddHtmlToArt(url, title);
            }
            catch (Exception ex)
            {
                // One bad link must not stop the remaining links from being processed.
                System.Diagnostics.Trace.TraceError("getVideos: failed to process a link: {0}", ex);
            }
        }
    }
    catch (Exception ex)
    {
        // Swallowed on purpose (best-effort collection), but no longer silently.
        System.Diagnostics.Trace.TraceError("getVideos: collection failed: {0}", ex);
    }
}
/// <summary>
/// Downloads the resource at <paramref name="url"/> over HTTP and returns its body
/// as text with all carriage returns and line feeds removed.
/// </summary>
/// <param name="url">Absolute URL to request.</param>
/// <returns>
/// The response body decoded as UTF-8 with every "\r" and "\n" removed.
/// If the response status is not 200 OK, a fixed error message is returned instead.
/// If any exception is raised while requesting or reading, that exception's message
/// is returned (also with "\r" and "\n" removed).
/// </returns>
public static string GetWholeHtmlCode(string url)
{
    string strHtml;
    try
    {
        HttpWebRequest wrqContent = (HttpWebRequest)WebRequest.Create(url);
        wrqContent.Timeout = 300000; // milliseconds, i.e. 5 minutes

        // 'using' releases the response and the reader even when reading fails.
        using (HttpWebResponse wrpContent = (HttpWebResponse)wrqContent.GetResponse())
        {
            if (wrpContent.StatusCode != HttpStatusCode.OK)
            {
                // Previously this message was always overwritten by the body read
                // that followed; the read now happens only for 200 OK.
                strHtml = "Sorry, the web page is not run successful";
            }
            else
            {
                using (StreamReader strReader = new StreamReader(wrpContent.GetResponseStream(), Encoding.UTF8))
                {
                    strHtml = strReader.ReadToEnd();
                }
            }
        }
    }
    catch (Exception e)
    {
        // Callers receive the failure reason as the returned text instead of an exception.
        strHtml = e.Message;
    }

    // Flatten to a single line so callers can search and regex-match across original line breaks.
    return strHtml.Replace("\r", "").Replace("\n", "");
}