from urllib.parse import urlencode
import requests
from pyquery import PyQuery as pq
import json
from pymongo import MongoClient

# Connect to a local MongoDB instance; posts go into the 'weibo' collection of the 'weibo' database
ccon = MongoClient(host='localhost', port=27017)
db = ccon['weibo']
collection = db['weibo']
base_url = 'https://m.weibo.cn/api/container/getIndex?'
# url_1 = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=2830678474&containerid=1076032830678474'
# url_2 = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=2830678474&containerid=1076032830678474&since_id=4506840937246905'
id_list = []
headers = {
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/u/2830678474',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}
def get_page(since_id):
    # Build the Ajax URL for one page of the user's timeline and return the parsed JSON body
    params = {
        'type': 'uid',
        'value': '2705604295',
        'containerid': '1076032705604295',
        'since_id': since_id
    }
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print('Error', e.args)
def parse_page(json1):
    # Extract the fields of interest from every mblog card in the response
    if json1:
        items = json1.get('data', {}).get('cards', [])
        for item in items:
            item = item.get('mblog')
            if not item:
                # skip cards that are not regular posts (ads, folded cards, etc.)
                continue
            weibo = {}
            weibo['id'] = item.get('id')
            weibo['text'] = pq(item.get('text')).text()  # strip HTML tags from the post body
            weibo['attitudes'] = item.get('attitudes_count')
            weibo['comments'] = item.get('comments_count')
            weibo['reposts'] = item.get('reposts_count')
            weibo['created_at'] = item.get('created_at')
            yield weibo
def qude_id(num):
    # Bootstrap: the first request is sent without since_id; every response then
    # carries the cursor for the next page in data.cardlistInfo.since_id.
    base_page_url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=2705604295&containerid=1076032705604295'
    url_3 = base_page_url
    while num > 1:
        response = requests.get(url_3, headers=headers)
        since_id = response.json()['data']['cardlistInfo']['since_id']  # cursor for the next page
        # id_list.append(since_id)
        json1 = get_page(since_id)
        for weibo in parse_page(json1):
            # The records could also be appended to a local file instead:
            # with open('weibo.json', 'a+', errors='ignore') as f:
            #     f.write(json.dumps(weibo, indent=2, ensure_ascii=False))
            if collection.insert_one(weibo):
                print('Saved to Mongo')
        url_3 = base_page_url + '&since_id=' + str(since_id)  # URL for the next page
        num -= 1
if __name__ == "__main__":
    # Crawl first (e.g. qude_id(30)) so the collection has data, then print every stored post.
    # An earlier approach collected the cursors into id_list and crawled from that list:
    # qude_id(30)
    # print(id_list)
    # for since_id in id_list:
    #     json1 = get_page(since_id)
    #     for weibo in parse_page(json1):
    #         print(weibo)
    for doc in collection.find():
        print(doc['text'])
Crawl a Sina Weibo user's posts and save them to a database (MongoDB).
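For reference, the same since_id pagination can also be driven entirely through get_page, so the page URL never has to be rebuilt by string concatenation as in qude_id. The sketch below is not part of the original script: it reuses the get_page, parse_page and collection objects defined above, assumes the API accepts an empty since_id for the first page, and assumes each response carries the next cursor in data.cardlistInfo.since_id (which the code above already relies on); the helper name crawl_user is made up for illustration.

def crawl_user(pages=10):
    # Hypothetical helper, not from the original post: walk `pages` pages by
    # passing the since_id cursor straight into get_page.
    since_id = ''  # assumed: an empty since_id returns the first page
    for _ in range(pages):
        data = get_page(since_id)
        if not data:
            break
        for weibo in parse_page(data):
            collection.insert_one(weibo)  # pymongo 3.x+ insert API
        # take the cursor for the next page; stop when the timeline is exhausted
        since_id = data.get('data', {}).get('cardlistInfo', {}).get('since_id')
        if not since_id:
            break

# crawl_user(10)

Passing the cursor through the params dict also means every fetched page is parsed and saved, whereas qude_id only uses its bootstrap request for the since_id value.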