模拟手机端爬取博主微博

最新推荐文章于 2022-10-26 15:24:04 发布

Xiaoweidumpb

最新推荐文章于 2022-10-26 15:24:04 发布

阅读量611

点赞数

分类专栏： python

本文链接：https://blog.csdn.net/qq_43751489/article/details/109462153

版权

python 专栏收录该内容

23 篇文章 0 订阅

订阅专栏

# Sample requests against the container API for one user's timeline: the first
# URL has no cursor, the second selects a later page through the since_id
# query parameter.
'https://m.weibo.cn/api/container/getIndex?type=uid&value=2970452952&containerid=1076032970452952'
'https://m.weibo.cn/api/container/getIndex?type=uid&value=2970452952&containerid=1076032970452952&since_id=4519809071656587'
import requests
from pyquery import PyQuery as pq




class weibo():
    """Crawl a blogger's timeline through the m.weibo.cn container API.

    Pages are requested one after another; each response supplies the
    ``since_id`` cursor used for the next request. Collected posts are kept
    in ``items``, keyed by the plain text of the post, with the card's
    ``scheme`` link as the value.
    """

    # Container endpoint for one fixed user; the cursor is appended via ``params``.
    url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=2970452952&containerid=1076032970452952&'
    # iPhone Safari User-Agent sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1',
    }
    # Default query parameters; every instance works on its own copy.
    params = {
        'since_id': None
    }
    # Class-level default kept for backward compatibility; instances get
    # their own dict in __init__.
    items = {}
    # Index of the first card to read on the next page (1 skips the first
    # card of the first page).
    t = 1

    def __init__(self):
        # Per-instance state: without these copies every crawler would share
        # (and mutate) the same cursor and the same result dict.
        self.params = dict(self.params)
        self.items = {}

    def catch_page(self, num):
        """Fetch up to ``num`` pages, printing the collected texts after each.

        Stops early when a response no longer carries a ``since_id`` cursor.

        Args:
            num: Maximum number of pages to request.
        """
        for _ in range(num):
            res = requests.get(url=self.url, headers=self.headers,
                               params=self.params, timeout=10)
            data = res.json()['data']  # parse the body once per page
            cards = data['cards']
            since_id = data['cardlistInfo'].get('since_id')
            self.params['since_id'] = since_id
            print(since_id)
            print(len(cards))
            for card in cards[max(self.t, 0):]:
                mblog = card.get('mblog')
                if mblog is None:
                    # Not a post card; nothing to extract.
                    continue
                # Store the link as a plain string: handing an http(s) URL to
                # PyQuery would make it download that page.
                self.items[pq(mblog['text']).text()] = card['scheme']
            # Only the first page skips leading cards; never let the start
            # index go negative.
            self.t = max(self.t - 1, 0)
            self.pri()
            if since_id is None:
                # No cursor for a next page: the timeline is exhausted.
                break

    def pri(self):
        """Print every collected post text followed by a separator line."""
        for key in self.items:
            print(key)
            print('---------------------------------------------------------------\n')

if __name__ == '__main__':
    # Crawl five pages of the timeline, printing the collected texts as we go.
    crawler = weibo()
    crawler.catch_page(5)


# Sample requests against the container API for one user's timeline: the first
# URL has no cursor, the second selects a later page through the since_id
# query parameter.
'https://m.weibo.cn/api/container/getIndex?type=uid&value=2970452952&containerid=1076032970452952'
'https://m.weibo.cn/api/container/getIndex?type=uid&value=2970452952&containerid=1076032970452952&since_id=4519809071656587'
import random
import time
import pymongo
from bson.objectid import ObjectId
import requests
from pyquery import PyQuery as pq

class Weibo():
    """Crawl a blogger's whole timeline from m.weibo.cn and store it in MongoDB.

    ``catch_page`` is a generator that walks the timeline page by page using
    the ``since_id`` cursor; ``set_mongodb`` persists one crawled post and
    ``get_mongodb`` reads a stored binary field back to disk.
    """

    url ='https://m.weibo.cn/api/container/getIndex?uid=2970452952&t=0&luicode=10000011&lfid=100103type%3D1%26amp%3Bq%3D%E6%9D%8E%E5%AD%90%E6%9F%92&containerid=1076032970452952&'
    # Alternative target account:
    #url=   'https://m.weibo.cn/api/container/getIndex?uid=5295256115&t=0&luicode=10000011&lfid=100103type%3D1%26amp%3Bq%3D%E6%9D%8E%E7%88%86%E9%A6%99L&type=uid&value=5295256115&containerid=1076035295256115'
    headers = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1',
        # A 'cookie' entry can be added here if a logged-in session is needed;
        # keep real session cookies out of source control.
    }
    # Default query parameters; every instance works on its own copy.
    params = {
        'since_id': None
    }
    # Number of posts yielded so far.
    count = 0
    # MongoDB server used by set_mongodb / get_mongodb.
    mongo_host = '81.69.9.31'
    # Lazily created client, reused for every database call.
    _mongo_client = None

    def __init__(self):
        # Copy the cursor dict so crawlers do not share paging state.
        self.params = dict(self.params)
        self._mongo_client = None

    def _get_db(self):
        """Return the ``weibo`` database, creating the client on first use."""
        if self._mongo_client is None:
            self._mongo_client = pymongo.MongoClient(self.mongo_host)
        return self._mongo_client.weibo

    def catch_page(self):
        """Yield one dict per post until the timeline is exhausted.

        Yields:
            dict: ``text`` (post body as plain text), ``origin_url`` (the
            card's ``scheme`` link) and ``time`` (the post's ``created_at``).
        """
        # Per the author's analysis, the crawl is finished once a response
        # no longer carries a since_id.
        while True:
            # Random pause between requests.
            time.sleep(random.randint(2, 5))
            res = requests.get(url=self.url, headers=self.headers,
                               params=self.params, timeout=10)
            data = res.json()['data']  # parse the body once per page
            for card in data['cards']:
                try:
                    mblog = card['mblog']
                    items = {
                        'text': pq(mblog['text']).text(),
                        'origin_url': card['scheme'],
                        'time': mblog['created_at'],
                    }
                except Exception as e:
                    # Best-effort crawl: report the malformed card and keep
                    # going with the rest of the page.
                    print(e)
                    continue
                # Image download (mblog['pics'][i]['large']['url']) was
                # sketched here by the author but left disabled.
                print(items['text'])
                self.count = self.count + 1
                yield items
            # Check the cursor only after the page has been yielded, so the
            # final page is not dropped.
            since_id = data['cardlistInfo'].get('since_id')
            if since_id is None:
                break
            self.params['since_id'] = since_id
        print(f"一共爬取了{self.count}条数据",)

    def set_mongodb(self, dict_):
        """Insert one crawled post into the ``liziqi_up`` collection.

        Insert errors are printed rather than raised.

        Args:
            dict_: The document to store.
        """
        db = self._get_db()
        try:
            db.liziqi_up.insert_one(dict_)
        except Exception as e1:
            print(e1)

    def get_mongodb(self):
        """Read field ``'0'`` of one fixed document from ``up`` into ./demo01.jpg.

        Uses the shared client; the original connected to the masked
        placeholder host '81.69.xx.xx', which cannot be resolved.
        """
        my_set = self._get_db().up
        data = my_set.find_one({'_id': ObjectId('5fa49acf6c51fac720c5fb3a')})
        if data is None:
            print('document not found')
            return
        print(data['0'])
        with open('./demo01.jpg', 'wb') as file:
            file.write(data['0'])


if __name__ == '__main__':
    # Stream every crawled post into MongoDB, one document at a time.
    crawler = Weibo()
    for post in crawler.catch_page():
        crawler.set_mongodb(post)
    # crawler.get_mongodb()

Xiaoweidumpb

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
模拟手机端爬取博主微博

'https://m.weibo.cn/api/container/getIndex?type=uid&value=2970452952&containerid=1076032970452952' 'https://m.weibo.cn/api/container/getIndex?type=uid&value=2970452952&containerid=1076032970452952&since_id=4519809071656587' import reque…
复制链接

扫一扫

专栏目录