题目要求:指定新浪微博的一个话题,如“#圆明园兽首#”,爬取指定话题下对应的所有微博内容。其中,每条微博内容包括:昵称、文本内容、链接地址、图片地址、视频地址等。必须按XML格式输出,支持翻页下载,并可同时爬取多个话题。
首先遇到的问题就是新浪微博主页面的HTML代码获取,在新浪微博的HTML源代码里,看不到任何的微博内容,应该是属于动态加载的,不能直接获取(惭愧,本人不善于网站架构,不明白新浪微博内容如何加载至页面,如果你看到这篇文章,如果你正好知道,请在下方留言,还望赐教!)。查询得知新浪微博提供官方API,在复杂的申请历程过后,使用C#语言调用API发现返回错误,查找原因才发现微博话题属于API高级查询内容,普通申请者无法获取,至此陷入瓶颈,想写爬虫结果连数据都还没看到,讽刺啊。。。
晚上11点左右回宿舍公交车上,玩手机的同时灵机一动想到手机页面的微博内容和电脑页面应该是一样的,但是手机页面应当好获取。白天做尝试,手机微博页面(weibo.cn)果然是同步新浪微博的,而且在其HTML页面中可以获取到微博内容。自此思路清晰,先获取页面代码,之后正则表达式匹配结果,最后写XML文件输出。
观察手机微博话题页面,基本就是“HTTP前缀+话题”这种形式,但是话题需要转换为utf-8编码形式。将用户输入的话题转换为utf-8编码,拼上前缀形成url地址,使用webBrowser控件获取返回HTML代码。因为webBrowser控件在获取HTML代码的时候有时间间隔,所以需要等其加载完成之后,启动线程下载话题。
为实现翻页下载,观察同一话题下各个页面的地址格式,也是“HTTP前缀+话题+页码”这种形式,所以先用正则表达式匹配出总共有多少页,再通过for循环取到每一页的地址,进行下载。对于每一页的代码,先分别匹配到每一条微博,再分别用正则表达式匹配昵称、文本内容、链接地址、图片地址、视频地址等信息,通过函数去除标签,提炼信息,最后存储写入XML文件。本文还实现了图片的下载功能,也就是通过图片的链接将图片下载至本地目录。下载完一页数据后将线程挂起并请求下一页数据,等待webBrowser控件下载完成后再将线程继续。
程序测试发现有的微博包含多个图片地址、视频地址等,本文程序仅取了正则匹配的第一个地址,循环遍历枚举器即可取到所有地址,我懒得写了。。。
整个项目打包上传至CSDN资源处,主要代码如下:(联系作者:liangsen1992@foxmail.com)
!!!特别说明:本程序未做新浪微博登录操作,需要登录后运行!
//BY 笑笔狂生
//E-mail: liangsen1992@foxmail.com
//2013.5.3
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
using System.Web;
using System.Threading;
namespace 新浪微博话题爬虫
{
public partial class Form1 : Form
{
/// <summary>
/// Creates the form: builds the designer-generated controls, then pre-fills
/// the topic text box with <c>globe.topic</c>.
/// </summary>
public Form1()
{
    InitializeComponent();
    textBox1.Text = globe.topic;
}
// Worker thread that runs downLoadMicroTopic; created in button1_Click and
// started/resumed from webBrowser1_DocumentCompleted.
private Thread t;
// Topic text captured from textBox1 when the button is clicked.
private string topic;
// HTML of the most recently completed WebBrowser document.
private string DocumentText;
// Total number of result pages, parsed from the first page by the worker.
private int pageTotalNum;
// Number of pages the worker has processed so far (starts at zero).
private int currentPageNum;
/// <summary>
/// Starts crawling the topic typed into <c>textBox1</c>: prepares the worker
/// thread and navigates the embedded browser to the first result page.
/// The worker itself is started later, by the browser's DocumentCompleted handler.
/// </summary>
/// <param name="sender">The clicked button (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    // The worker suspends itself through the shared field `t`. Replacing `t`
    // while a crawl is still alive would make the old worker suspend the wrong
    // thread and would leave it (and its open output file) orphaned.
    if (t != null && t.IsAlive)
    {
        return;
    }

    string requestedTopic = textBox1.Text;
    if (string.IsNullOrEmpty(requestedTopic))
    {
        // Nothing to search for.
        return;
    }

    topic = requestedTopic;
    t = new Thread(new ThreadStart(downLoadMicroTopic));

    // Uri.EscapeDataString percent-encodes the UTF-8 bytes with two hex digits
    // each; the former "%{0:X}" format produced a single digit for bytes < 0x10.
    string topicUrl = "http://weibo.cn/search/mblog/?keyword=" + Uri.EscapeDataString(topic);
    webBrowser1.Navigate(topicUrl);
}
/// <summary>
/// Worker-thread entry point. Parses the page currently held in
/// <c>DocumentText</c>, appends one message element per post to
/// <c>topic\&lt;topic&gt;.xml</c>, saves each post's first image under
/// <c>topic\pic\</c>, then requests the next page and suspends itself until
/// the browser's DocumentCompleted handler resumes it.
/// </summary>
/// <remarks>
/// The loop index runs to <c>pageTotalNum + 1</c>, exactly as before: the final
/// navigation goes one page past the end, and its completion is what resumes
/// this thread so the XML file gets closed and the labels get their final refresh.
/// NOTE(review): Thread.Suspend/Resume is obsolete; it is kept because the
/// DocumentCompleted handler depends on this handshake.
/// </remarks>
private void downLoadMicroTopic()
{
    // Use the topic captured at click time instead of reading textBox1 from
    // this non-UI thread.
    string currentTopic = topic ?? string.Empty;
    string outputDir = "topic";
    string pictureDir = Path.Combine(outputDir, "pic");
    // Creates both "topic" and "topic\pic"; a no-op when they already exist.
    Directory.CreateDirectory(pictureDir);

    // Two-hex-digit UTF-8 percent-encoding (the old "%{0:X}" dropped leading zeros).
    string encodedTopic = Uri.EscapeDataString(currentTopic);
    string xmlPath = Path.Combine(outputDir, ToSafeFileName(currentTopic) + ".xml");

    // `using` guarantees the file is flushed and closed even if parsing throws.
    using (StreamWriter sw = new StreamWriter(xmlPath))
    {
        sw.WriteLine("<topic>");
        pageTotalNum = ParsePageTotal(DocumentText);

        for (int i = 2; i <= pageTotalNum + 1; i++)
        {
            WritePageMessages(sw, DocumentText, pictureDir);
            currentPageNum++;

            string nextUrl = "http://weibo.cn/search/mblog?hideSearchFrame=&keyword=" + encodedTopic + "&page=" + i.ToString();
            // The WebBrowser control belongs to the UI thread; marshal the call there.
            webBrowser1.Invoke(new Action(delegate { webBrowser1.Navigate(nextUrl); }));

            // Sleep until DocumentCompleted has stored the next page and resumes us.
            t.Suspend();
        }

        sw.WriteLine("</topic>");
    }
}

// Reads the total page count from the pager text ("current/total页") of the first
// result page; falls back to 1 when the pager is absent or cannot be parsed.
private static int ParsePageTotal(string html)
{
    Match pager = Regex.Match(html ?? string.Empty, "<input type=\"submit\" value=\"跳页\".*?页");
    if (!pager.Success)
    {
        return 1;
    }

    Match total = Regex.Match(pager.Value, "/(\\d+)页$");
    int pages;
    if (total.Success && int.TryParse(total.Groups[1].Value, out pages) && pages > 0)
    {
        return pages;
    }

    return 1;
}

// Writes one message element for every post found in a result page.
private void WritePageMessages(StreamWriter sw, string pageHtml, string pictureDir)
{
    foreach (Match post in Regex.Matches(pageHtml ?? string.Empty, "a class=\"nk\" href=\"http://.*?收藏"))
    {
        sw.WriteLine(" <message>");

        // Drop everything from the last "赞", then from the last remaining
        // "<a href", so only the post body is left.
        string fragment = post.Value;
        int cut = fragment.LastIndexOf("赞");
        if (cut >= 0)
        {
            fragment = fragment.Substring(0, cut);
        }
        cut = fragment.LastIndexOf("<a href");
        if (cut >= 0)
        {
            fragment = fragment.Substring(0, cut);
        }

        // Each extractor consumes a prefix of `fragment`, so the order matters.
        // Nickname
        string nickName = ExtractNickName(ref fragment);
        sw.WriteLine(" <name>" + HtmlToXml(nickName) + "</name>");

        // Text content (already entity-decoded by RemoveSpaceHtmlTag)
        string text = ExtractText(ref fragment);
        sw.WriteLine(" <text>" + EscapeXml(text) + "</text>");

        // Link address
        string microUrl = ExtractLink(ref fragment);
        sw.WriteLine(" <link>" + HtmlToXml(microUrl) + "</link>");

        // Image address (first image only)
        string imgUrl = WebUtility.HtmlDecode(ExtractImageUrl(ref fragment)) ?? string.Empty;
        sw.WriteLine(" <image>" + EscapeXml(imgUrl) + "</image>");
        if (imgUrl != "")
        {
            DownloadImage(imgUrl, pictureDir);
        }

        // Audio address: not extracted; the element is always empty.
        sw.WriteLine(" <audio></audio>");

        // Video address (first video only)
        string videoUrl = ExtractVideo(ref fragment);
        sw.WriteLine(" <video>" + HtmlToXml(videoUrl) + "</video>");

        sw.WriteLine(" </message>");
    }
}

// Returns the text of the leading nickname anchor and consumes that anchor.
// NOTE(review): here and in the other extractors, `fragment` is advanced by the
// match LENGTH counted from the start of the fragment (not match index + length),
// as the original code did; later extractors depend on that exact remainder.
private static string ExtractNickName(ref string fragment)
{
    Match anchor = Regex.Match(fragment, "a class=\"nk\" href=\"http://.*?</a>");
    if (!anchor.Success)
    {
        return "";
    }

    fragment = fragment.Substring(anchor.Length);
    string inner = anchor.Value.Substring(anchor.Value.IndexOf('>') + 1);
    // Strip the trailing "</a>"; too short means the anchor was malformed.
    return inner.Length >= 4 ? inner.Substring(0, inner.Length - 4) : "";
}

// Returns the cleaned post text and consumes the matched span.
private string ExtractText(ref string fragment)
{
    Match span = Regex.Match(fragment, "<span class=\"ctt\">.*?<a href=\"http://weibo.cn/");
    if (!span.Success)
    {
        return "";
    }

    fragment = fragment.Substring(span.Length);
    string body = span.Value;
    // When no "</a>" is present IndexOf yields -1 and this skips the first three
    // characters ("<sp"); RemoveSpaceHtmlTag strips the resulting
    // an class="ctt"> remnant.
    body = body.Substring(body.IndexOf("</a>") + 4);
    int nextAnchor = body.IndexOf("<a");
    if (nextAnchor < 0)
    {
        return "";
    }

    return RemoveSpaceHtmlTag(body.Substring(0, nextAnchor));
}

// Returns the last http:// URL found before the first "</a>" of the span tail
// and consumes everything up to the next "</span>".
private static string ExtractLink(ref string fragment)
{
    int spanEnd = fragment.IndexOf("</span>");
    if (spanEnd < 0)
    {
        return "";
    }

    string head = fragment.Substring(0, spanEnd);
    fragment = fragment.Substring(spanEnd);

    int anchorEnd = head.IndexOf("</a>");
    if (anchorEnd < 0)
    {
        return "";
    }
    head = head.Substring(0, anchorEnd);

    int urlStart = head.LastIndexOf("http://");
    if (urlStart < 0)
    {
        return "";
    }
    head = head.Substring(urlStart);

    int quote = head.IndexOf("\"");
    return quote < 0 ? "" : head.Substring(0, quote);
}

// Returns the first image URL (raw attribute text) and consumes the match.
private static string ExtractImageUrl(ref string fragment)
{
    // The dot before the extension is escaped so that it only matches a literal '.'.
    Match img = Regex.Match(fragment, "img src=\"http://.*?\\.(jpg|gif|png|bmp|jpeg)");
    if (!img.Success)
    {
        return "";
    }

    fragment = fragment.Substring(img.Length);
    // Skip the 9-character prefix: img src="
    return img.Value.Substring(9);
}

// Returns the anchor text of the first video link and consumes the match.
private static string ExtractVideo(ref string fragment)
{
    Match video = Regex.Match(fragment, "<span class=\"kt\">视频</span>]<a href.*?</a>");
    if (!video.Success)
    {
        return "";
    }

    fragment = fragment.Substring(video.Length);
    string anchor = video.Value.Substring(0, video.Value.Length - 4);
    return anchor.Substring(anchor.LastIndexOf(">") + 1);
}

// Saves one image into pictureDir with a single request. A failed download is
// logged and skipped so that one broken image does not abort the whole crawl.
private static void DownloadImage(string imageUrl, string pictureDir)
{
    string fileName = ToSafeFileName(imageUrl.Substring(imageUrl.LastIndexOf('/') + 1));
    if (fileName.Length == 0)
    {
        return;
    }

    try
    {
        using (WebClient client = new WebClient())
        {
            client.DownloadFile(imageUrl, Path.Combine(pictureDir, fileName));
        }
    }
    catch (WebException ex)
    {
        System.Diagnostics.Trace.TraceWarning("Image download failed for {0}: {1}", imageUrl, ex.Message);
    }
}

// Replaces characters that are not allowed in file names with '_'.
private static string ToSafeFileName(string name)
{
    foreach (char invalid in Path.GetInvalidFileNameChars())
    {
        name = name.Replace(invalid, '_');
    }
    return name;
}

// Escapes the five XML special characters so the value is safe as element text.
private static string EscapeXml(string value)
{
    return System.Security.SecurityElement.Escape(value) ?? string.Empty;
}

// Converts raw HTML text (which may carry HTML entities) into XML-safe text.
private static string HtmlToXml(string html)
{
    return EscapeXml(WebUtility.HtmlDecode(html));
}
/// <summary>
/// Runs when the embedded browser finishes loading a document: refreshes the
/// progress labels, stores the page HTML in <c>DocumentText</c>, then either
/// starts the worker thread (first page) or resumes it (subsequent pages).
/// </summary>
/// <param name="sender">The WebBrowser control (unused).</param>
/// <param name="e">Event data (unused).</param>
private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
    label2.Text = "本话题共" + pageTotalNum + "页";
    label3.Text = "当前已下载" + currentPageNum + "页";
    DocumentText = webBrowser1.DocumentText;

    // A document can complete before the button has ever been clicked.
    if (t == null)
    {
        return;
    }

    // ThreadState is a [Flags] enum, so test the relevant bit rather than
    // comparing for equality. The state is read once and handled with
    // if/else so that a thread started here is never resumed by the same event.
    ThreadState state = t.ThreadState;
    if ((state & ThreadState.Unstarted) != 0)
    {
        t.Start();
    }
    else if ((state & ThreadState.Suspended) != 0)
    {
        t.Resume();
    }
}
/// <summary>
/// Turns an HTML fragment into plain text: strips tags and comments, decodes a
/// fixed set of entities, drops the leftover <c>an class="ctt"&gt;</c> remnant and
/// the " 详情:" marker, removes any remaining angle brackets and collapses
/// whitespace.
/// </summary>
/// <param name="Input">HTML fragment; null or empty yields an empty string.</param>
/// <returns>The cleaned text, trimmed, with runs of whitespace reduced to one space.</returns>
private string RemoveSpaceHtmlTag(string Input)
{
    if (string.IsNullOrEmpty(Input))
    {
        return "";
    }

    string input = Input;
    // Strip HTML tags, line breaks followed by whitespace, and comment markers.
    input = Regex.Replace(input, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
    input = Regex.Replace(input, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
    input = Regex.Replace(input, @"-->", "", RegexOptions.IgnoreCase);
    input = Regex.Replace(input, @"<!--.*", "", RegexOptions.IgnoreCase);
    // Decode the supported named/numeric entities.
    input = Regex.Replace(input, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
    input = Regex.Replace(input, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
    input = Regex.Replace(input, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
    input = Regex.Replace(input, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
    input = Regex.Replace(input, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
    input = Regex.Replace(input, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
    input = Regex.Replace(input, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
    input = Regex.Replace(input, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
    input = Regex.Replace(input, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);
    // Any other numeric entity is dropped.
    input = Regex.Replace(input, @"&#(\d+);", "", RegexOptions.IgnoreCase);
    // Remnant of a <span class="ctt"> tag whose first three characters were cut off by the caller.
    input = Regex.Replace(input, "an class=\"ctt\">:", "", RegexOptions.IgnoreCase);
    input = Regex.Replace(input, "an class=\"ctt\">", "", RegexOptions.IgnoreCase);
    input = Regex.Replace(input, " 详情:", "", RegexOptions.IgnoreCase);
    // Strings are immutable: the result of Replace must be assigned back,
    // otherwise these three removals have no effect.
    input = input.Replace("<", "");
    input = input.Replace(">", "");
    input = input.Replace("\r\n", "");
    // Trim both ends and collapse internal runs of whitespace.
    input = Regex.Replace(input.Trim(), "\\s+", " ");
    return input;
}
}
}