Python-新浪微博爬虫采集数据

最新推荐文章于 2024-05-14 05:32:52 发布

laymenISmouse

最新推荐文章于 2024-05-14 05:32:52 发布

阅读量8k

点赞数 4

分类专栏： Python 文章标签： python

本文链接：https://blog.csdn.net/u010343650/article/details/52945630

版权

想要采集新浪微博的数据，如何不做模拟登陆，情况如下：

<!DOCTYPE html>
<html>
<head>
    <meta http-equiv="Content-type" content="text/html; charset=gb2312"/>
    <title>Sina Visitor System</title>
</head>
<body>
<span id="message"></span>
<script type="text/javascript" src="/js/visitor/mini.js"></script>
<script type="text/javascript">
    window.use_fp = "1" == "1"; // 是否采集设备指纹。
    var url = url || {};
    (function () {
        this.l = function (u, c) {
            try {
                var s = document.createElement("script");
                s.type = "text/javascript";
                s[document.all ? "onreadystatechange" : "onload"] = function () {

                    if (document.all && this.readyState != "loaded" && this.readyState != "complete") {
                        return
                    }
                    this[document.all ? "onreadystatechange" : "onload"] = null;
                    this.parentNode.removeChild(this);
                    if (c) {
                        c()
                    }
                };
                s.src = u;
                document.getElementsByTagName("head")[0].appendChild(s)
            } catch (e) {
            }
        };
    }).call(url);

    // 流程入口。
    wload(function () {

        try {

            var need_restore = "1" == "1"; // 是否走恢复身份流程。

            // 如果需要走恢复身份流程，尝试从 cookie 获取用户身份。
            if (!need_restore || !Store.CookieHelper.get("SRF")) {

                // 若获取失败走创建访客流程。
                // 流程执行时间过长（超过 3s），则认为出错。
                var error_timeout = window.setTimeout("error_back()", 3000);

                tid.get(function (tid, where, confidence) {
                    // 取指纹顺利完成，清除出错 timeout 。
                    window.clearTimeout(error_timeout);
                    incarnate(tid, where, confidence);
                });
            } else {
                // 用户身份存在，尝试恢复用户身份。
                restore();
            }
        } catch (e) {
            // 出错。
            error_back();
        }
    });

    // “返回” 回调函数。
    var return_back = function (response) {

        if (response["retcode"] == 20000000) {
            back();
        } else {
            // 出错。
            error_back(response["msg"]);
        }
    };

    // 跳转回初始地址。
    var back = function() {

        var url = "http://weibo.com/zhaoliying?is_search=0&visible=0&is_tag=0&profile_ftype=1&page=2";
        if (url != "none") {
            window.location.href = url;
        }
    };

    // 跨域广播。
    var cross_domain = function (response) {

        var from = "weibo";
        if (response["retcode"] == 20000000) {

            var crossdomain_host = "login.sina.com.cn";
            if (crossdomain_host != "none") {

                var cross_domain_intr = window.location.protocol + "//" + crossdomain_host + "/visitor/visitor?a=crossdomain&cb=return_back&s=" +
                        encodeURIComponent(response["data"]["sub"]) + "&sp=" + encodeURIComponent(response["data"]["subp"]) + "&from="

最低0.47元/天解锁文章

laymenISmouse

关注

4
点赞
踩
16

收藏

觉得还不错? 一键收藏
0
评论
Python-新浪微博爬虫采集数据

接着我们马上需要做的预登陆了。登陆的时候我们需要用到其中的servertime、nonce、pubkey、rsakv字段，使用抓包我们看到链接http://i.sso.sina.com.cn/js/ssologin.js查看，复制到txt文件中,并用NodePad++打开,搜索username的加密方式，如图我看到的是：用户名username经过base64编码后得到值和登陆密码的加密方式。
复制链接

扫一扫

专栏目录