用新浪微博api收集数据有诸多限制,每小时只能调用官方api函数150次,认证也很麻烦。因此想通过爬网页的方式来收集数据。访问新浪微博用户网页首先需要登录,登录获取cookie后可直接获取网页数据,无需再次登录。获取登录cookie的方式具体如下:
2)处理新浪微博用户密码psw。若hex_sha1(pwd)表示对pwd进行sha1处理,则新浪微博用户密码的处理方式表示为hex_sha1(hex_sha1(hex_sha1(psw)) + servertime + nonce)。
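3)以post方式向登录地址提交如下数据:
string str =
"entry=weibo&gateway=1&from=&savestate=7&useticket=1&ssosimplelogin=1&su="
+ 登录账户base64值
+ "&service=miniblog&servertime=" + servertime
+ "&nonce=" + nonce + "&pwencode=wsse&sp="
+ 处理后的密码
+ "&encoding=utf-8&url="
+ UrlEncode("http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack")
+ "&returntype=META";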
4)如果登录成功,返回页面中retcode值为0;get方式访问网页中location.replace处的地址。
保存这三次访问页面的cookie后,下次再访问新浪微博就不再需要重新登录了。具体代码如下:
// Cookie container shared at class level. Presumably this is the instance callers pass
// to SinaLogin so the login cookies can be reused by later requests -- confirm against callers.
static CookieContainer cc = new CookieContainer();
/// <summary>
/// Logs in to Sina Weibo through the SSO endpoints and collects the session cookies in
/// <paramref name="cc"/>, so that later requests using the same container are authenticated.
/// The flow is: (1) GET prelogin.php to obtain <c>servertime</c> and <c>nonce</c>,
/// (2) POST the credentials to login.php, (3) GET the URL found in the
/// <c>location.replace("...")</c> call of the returned page.
/// </summary>
/// <param name="uid">Login account (for example an e-mail address); sent Base64-encoded.</param>
/// <param name="psw">Plain-text password; sent as hex_sha1(hex_sha1(hex_sha1(psw)) + servertime + nonce).</param>
/// <param name="cc">Cookie container attached to all three requests; receives the login cookies.</param>
/// <returns>0 on success; -1 when a response cannot be parsed or the reported retcode is not zero.</returns>
/// <remarks>
/// Exceptions raised by the underlying web requests (for example on network failure) are not
/// caught here and propagate to the caller.
/// </remarks>
public static int SinaLogin(string uid,
    string psw, CookieContainer cc)
{
    // Encode the login account (e.g. an e-mail address) for the "user"/"su" fields.
    // NOTE(review): the Base64 text is sent without URL-encoding; '+' or '=' characters may
    // need escaping -- confirm against the server's behavior.
    string uidbase64 = Base64Code(uid);

    // Step 1: pre-login request that returns servertime and nonce.
    string url = "http://login.sina.com.cn/sso/prelogin.php?entry=miniblog&callback=sinaSSOController.preloginCallBack&user="
        + uidbase64 + "&client=ssologin.js(v1.3.16)";
    HttpWebRequest webRequest1 = (HttpWebRequest)WebRequest.Create(new Uri(url));
    webRequest1.CookieContainer = cc;

    string res;
    using (HttpWebResponse response1 = (HttpWebResponse)webRequest1.GetResponse())
    using (StreamReader sr1 = new StreamReader(response1.GetResponseStream(), Encoding.UTF8))
    {
        res = sr1.ReadToEnd();
    }

    // servertime: skip the 10-character key plus 2 separator characters (presumably `":`),
    // then take everything up to the next comma.
    int start = res.IndexOf("servertime");
    if (start < 0)
    {
        return -1;
    }
    int end = res.IndexOf(',', start);
    if (end < start + 12)
    {
        // No comma, or the comma is too close for a value to fit: malformed response.
        return -1;
    }
    string servertime = res.Substring(start + 12, end - start - 12);

    // nonce: skip the 5-character key plus 3 separator characters (presumably `":"`), and
    // drop the single character right before the comma (presumably the closing quote).
    start = res.IndexOf("nonce");
    if (start < 0)
    {
        return -1;
    }
    end = res.IndexOf(',', start);
    if (end < start + 9)
    {
        return -1;
    }
    string nonce = res.Substring(start + 8, end - start - 9);

    // Step 2: hash the password together with the server-provided values and post the form.
    string password = hex_sha1("" + hex_sha1(hex_sha1(psw)) +
        servertime + nonce);
    string str =
        "entry=weibo&gateway=1&from=&savestate=7&useticket=1&ssosimplelogin=1&su="
        + uidbase64 + "&service=miniblog&servertime=" + servertime
        + "&nonce=" + nonce + "&pwencode=wsse&sp=" + password
        + "&encoding=utf-8&url="
        + HttpUtility.UrlEncode("http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack")
        + "&returntype=META";
    byte[] bytes = new ASCIIEncoding().GetBytes(str);

    HttpWebRequest webRequest2 =
        (HttpWebRequest)WebRequest.Create("http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.3.16)");
    webRequest2.Method = "POST";
    webRequest2.ContentType = "application/x-www-form-urlencoded";
    webRequest2.ContentLength = bytes.Length;
    webRequest2.CookieContainer = cc;
    using (Stream stream = webRequest2.GetRequestStream())
    {
        stream.Write(bytes, 0, bytes.Length);
    }

    string res2;
    using (HttpWebResponse response2 = (HttpWebResponse)webRequest2.GetResponse())
    using (StreamReader sr2 = new StreamReader(response2.GetResponseStream(), Encoding.Default))
    {
        res2 = sr2.ReadToEnd();
    }

    // The login result is reported as "retcode" followed by one separator character and the
    // numeric code; only a zero code means success.
    int pos = res2.IndexOf("retcode");
    if (pos < 0 || !IsRetcodeZero(res2, pos + 8))
    {
        return -1;
    }

    // Step 3: follow the redirect embedded as location.replace("<url>") in the page.
    // 18 = length of `location.replace` (16) plus the opening `(` and `"`.
    start = res2.IndexOf("location.replace");
    if (start < 0)
    {
        return -1;
    }
    end = res2.IndexOf("\")", start);
    if (end <= start + 18)
    {
        return -1;
    }
    url = res2.Substring(start + 18, end - start - 18);

    Uri redirect;
    if (!Uri.TryCreate(url, UriKind.Absolute, out redirect))
    {
        return -1;
    }

    HttpWebRequest webRequest3 = (HttpWebRequest)WebRequest.Create(redirect);
    webRequest3.CookieContainer = cc;
    using (HttpWebResponse response3 = (HttpWebResponse)webRequest3.GetResponse())
    using (StreamReader sr3 = new StreamReader(response3.GetResponseStream(), Encoding.UTF8))
    {
        // The body itself is not used; it is read to the end as the original code did.
        sr3.ReadToEnd();
        foreach (Cookie cookie in response3.Cookies)
        {
            cc.Add(cookie);
        }
    }
    return 0;
}

/// <summary>
/// Scans the retcode value starting at <paramref name="valueStart"/>: leading '0' characters
/// are skipped, a digit 1-9 means a non-zero (error) code, and any non-digit ends the value
/// as zero. Returns false when the page ends before the value is terminated.
/// </summary>
private static bool IsRetcodeZero(string page, int valueStart)
{
    for (int i = valueStart; i < page.Length; i++)
    {
        char c = page[i];
        if (c == '0')
        {
            continue;
        }
        // Non-digit: the value consisted only of zeros. Digit 1-9: non-zero error code.
        return c < '0' || c > '9';
    }
    // Ran off the end of the page (truncated response): treat as failure.
    return false;
}
// Base64 encoding (an encoding, not encryption)
public static string Base64Code(string Message)
{
char[] Base64Code = new
char[]{'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T', 'U','V','W','X','Y','Z','a','b','c','d','e','f','g','h','i','j','k','l','m','n', 'o','p','q','r','s','t','u','v','w','x','y','z','0','1','2','3','4','5','6','7', '8','9','+','/','='};
byte empty = (byte)0;
System.Collections.ArrayList byteMessage = new
System.Collections.ArrayList(System.Text.Encoding.Default.GetBytes(Message));
System.Text.StringBuilder outmessage;
int messageLen = byteMessage.Count;
int page = messageLen / 3;
int use = 0;
if ((use = messageLen % 3) > 0)
{
for (int i = 0; i < 3 - use; i++)
byteMessage.Add(empty);
page++;
}
outmessage = new System.Text.StringBuilder(page * 4);
for (int i = 0; i < page; i++)
{