php rsa模拟登录微博,微博模拟登录爬虫

该博客介绍了一个使用Python编写的微博爬虫,通过登录、获取数据等步骤,实现了对特定用户微博的抓取。代码中涉及到了requests、BeautifulSoup、rsa等库,用于模拟登录、加密密码及页面内容解析。最终将抓取的数据保存到CSV文件。
摘要由CSDN通过智能技术生成

起因

起因是有人联系我求助(原文此处被邮箱保护脚本替换为“[email protected]”,具体文字已丢失),同时她附了源码。我下载源码之后发现没办法运行,简单调试了一下,目前可以正常获取数据。现在把修改后的代码贴在下面:

# -*- coding: utf-8 -*-

"""

Created on Fri Jul 12 16:08:41 2019

@author: Single

"""

import requests

import urllib

import base64

import time

import re

import json

import rsa

import binascii

from bs4 import BeautifulSoup

import pandas as pd

import numpy as np

from requests.packages.urllib3.connectionpool import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# Browser-like request headers sent with every HTTP request in this script.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0',
    # "br" is deliberately not advertised: requests can only decode Brotli
    # bodies when an optional brotli package is installed, and otherwise
    # response.text would contain undecodable bytes.
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Referer': 'https://weibo.com/?sudaref=www.baidu.com&display=0&retcode=6102',
    'Connection': 'keep-alive',
}

class Login(object):
    """Weibo login helper built on the Sina SSO prelogin/login endpoints.

    The flow implemented here is: fetch the prelogin data (server time,
    nonce, RSA public key), RSA-encrypt the password with it, post the login
    form, then send the returned ticket to passport.weibo.com using the same
    HTTP session.

    Args:
        user_name: Account name; defaults to the class-level ``user_name``.
        pass_word: Account password; defaults to the class-level ``pass_word``.
    """

    # NOTE(review): credentials are committed in source. They are kept only
    # as backward-compatible defaults; prefer passing them to the constructor.
    user_name = "17526922476"

    pass_word = "zhuyilong"

    def __init__(self, user_name=None, pass_word=None):
        if user_name is not None:
            self.user_name = user_name
        if pass_word is not None:
            self.pass_word = pass_word
        # One HTTP session per instance, so cookies of different accounts
        # are never mixed and nothing is created at class-definition time.
        self.session = requests.session()

    @staticmethod
    def _parse_jsonp(text):
        """Decode the JSON payload wrapped in a ``callback(...)`` response.

        Raises:
            ValueError: if *text* contains no parenthesised payload or the
                payload is not valid JSON.
        """
        match = re.search(r"\((?P<data>.*)\)", text)
        if match is None:
            raise ValueError("response is not a JSONP callback: %r" % (text[:80],))
        return json.loads(match.group("data"))

    def _require_pre_login(self):
        """Return the prelogin data or raise ``RuntimeError`` if unavailable."""
        pre_login = self.get_pre_login()
        if not pre_login:
            raise RuntimeError("获取预登录信息失败,无法继续登录")
        return pre_login

    def get_username(self):
        """Return the user name URL-quoted and then base64-encoded (``su``)."""
        # Mirrors ssologin.js:
        # request.su = sinaSSOEncoder.base64.encode(urlencode(username));
        # Imported here because a bare ``import urllib`` does not guarantee
        # that the ``urllib.parse`` submodule is loaded.
        from urllib.parse import quote

        return base64.b64encode(quote(self.user_name).encode("utf-8")).decode("utf-8")

    def get_pre_login(self):
        """Fetch (once) and return the prelogin data as a dict.

        The decoded response is cached on the instance as ``pre_login`` so
        that Weibo is not queried repeatedly (repeated requests may be
        rejected by anti-crawling measures).

        Returns:
            The decoded prelogin dict (this class reads its ``servertime``,
            ``nonce``, ``pubkey`` and ``rsakv`` keys), or ``0`` if the
            request or the parsing failed.
        """
        if hasattr(self, 'pre_login'):
            return getattr(self, 'pre_login')
        params = {
            "entry": "weibo",
            "callback": "sinaSSOController.preloginCallBack",
            "su": self.get_username(),
            "rsakt": "mod",
            "checkpin": "1",
            "client": "ssologin.js(v1.4.19)",
            "_": int(time.time() * 1000)
        }
        try:
            response = self.session.post("https://login.sina.com.cn/sso/prelogin.php", params=params,
                                         headers=header, verify=False)
            res = self._parse_jsonp(response.text)
        except Exception as e:
            # Best effort: report the problem and signal failure with 0.
            print(e)
            print("获取公钥失败")
            return 0
        self.pre_login = res
        return res

    def get_password(self):
        """Return the hex-encoded, RSA-encrypted password (``sp``).

        Raises:
            RuntimeError: if the prelogin data could not be obtained.
        """
        pre_login = self._require_pre_login()
        # Mirrors ssologin.js: RSAKey.setPublic(me.rsaPubkey, "10001");
        public_key = rsa.PublicKey(int(pre_login["pubkey"], 16), int("10001", 16))
        # Mirrors ssologin.js:
        # password = RSAKey.encrypt([me.servertime, me.nonce].join("\t") + "\n" + password)
        password_string = str(pre_login["servertime"]) + '\t' + str(pre_login["nonce"]) + '\n' + self.pass_word
        return binascii.b2a_hex(rsa.encrypt(password_string.encode("utf-8"), public_key)).decode("utf-8")

    def login(self):
        """Perform the SSO login and return the HTTP session that was used.

        Raises:
            RuntimeError: if the prelogin data is unavailable or the login
                response carries no ticket (e.g. wrong credentials).
        """
        pre_login = self._require_pre_login()
        post_data = {
            "entry": "weibo",
            "gateway": "1",
            "from": "",
            "savestate": "7",
            "qrcode_flag": "false",
            "useticket": "1",
            "vsnf": "1",
            "su": self.get_username(),
            "service": "miniblog",
            "servertime": pre_login["servertime"],
            "nonce": pre_login["nonce"],
            "pwencode": "rsa2",
            "rsakv": pre_login["rsakv"],
            "sp": self.get_password(),
            "sr": "1536*864",
            "encoding": "UTF-8",
            "prelt": "529",
            "url": "https://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack",
            "returntype": "TEXT"
        }
        login_data = self.session.post("https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)",
                                       data=post_data, headers=header, verify=False)
        result = login_data.json()
        ticket = result.get("ticket") if isinstance(result, dict) else None
        if not ticket:
            # Without a ticket the second step cannot succeed; surface the
            # server's answer instead of a bare KeyError.
            raise RuntimeError("登录失败: %s" % (result,))
        params = {
            "ticket": ticket,
            "ssosavestate": int(time.time()),
            "callback": "sinaSSOController.doCrossDomainCallBack",
            "scriptId": "ssoscript0",
            "client": "ssologin.js(v1.4.19)",
            "_": int(time.time() * 1000)
        }
        self.session.post("https://passport.weibo.com/wbsso/login", params=params, verify=False, headers=header)
        return self.session

# Log in once when the module is executed; the returned session is the
# module-level ``session`` used by the request helpers defined below.
login = Login()

session = login.login()

def get_page_session(date):
    """Request page 1 of the hard-coded Weibo search and return the response.

    Args:
        date: Currently unused; the time range is fixed in the URL.
            NOTE(review): the expected format of ``date`` is not visible
            here, so it is not substituted into the URL — confirm and wire in.

    Returns:
        The response object returned by the module-level ``session``.
    """
    # Pause before every request (presumably to avoid being rate limited).
    time.sleep(2)
    # The query parameter is "&timescope=": the pasted source had it
    # corrupted to "×cope" because "&times" was decoded as an HTML entity.
    return session.post(
        "https://s.weibo.com/weibo/?q=%E6%9C%B1%E4%B8%80%E9%BE%99&typeall=1&suball=1"
        "&timescope=custom:2019-07-01-4:2019-07-01-5&Refer=g&page=1",
        verify=False, headers=header)

def get_data_session(date, page):
    """Request one result page of the hard-coded Weibo search.

    Args:
        date: Currently unused; the time range is fixed in the URL.
            NOTE(review): the expected format of ``date`` is not visible
            here, so it is not substituted into the URL — confirm and wire in.
        page: Result page number to request.

    Returns:
        The response object returned by the module-level ``session``.
    """
    # Pause before every request (presumably to avoid being rate limited).
    time.sleep(2)
    # The query parameter is "&timescope=": the pasted source had it
    # corrupted to "×cope" because "&times" was decoded as an HTML entity.
    # The URL is concatenated (not %-formatted) because it contains
    # percent-escapes.
    url = ("https://s.weibo.com/weibo/?q=%E6%9C%B1%E4%B8%80%E9%BE%99&typeall=1&suball=1"
           "&timescope=custom:2019-07-01-4:2019-07-01-5&Refer=g&page=" + str(page))
    return session.post(url, verify=False, headers=header)

def get_page_res(date):
    """Call ``get_page_session(date)``, retrying once on failure.

    Returns:
        The response of the first successful attempt, or ``0`` after two
        failed attempts (a failure message is printed in that case).
    """
    for _ in range(2):
        try:
            return get_page_session(date)
        except Exception:
            # Only ordinary errors are retried; KeyboardInterrupt and
            # SystemExit propagate so the script can still be stopped.
            continue
    print("获取页码信息失败", date)
    return 0

def get_data_res(date, page):
    """Call ``get_data_session(date, page)``, retrying once on failure.

    Returns:
        The response of the first successful attempt, or ``0`` after two
        failed attempts (a failure message is printed in that case).
    """
    for _ in range(2):
        try:
            return get_data_session(date, page)
        except Exception:
            # Only ordinary errors are retried; KeyboardInterrupt and
            # SystemExit propagate so the script can still be stopped.
            continue
    print("获取页码信息失败", date, page)
    return 0

def get_page(date):
    """Return the number of ``<li>`` entries in the search page's pager.

    The count is taken from the ``<ul class="s-scroll">`` element of the
    page returned by ``get_page_res(date)``.

    Returns:
        The number of pager entries, or ``0`` when the page could not be
        fetched or the pager element could not be read.
    """
    response = get_page_res(date)
    if not response:
        return 0
    try:
        soup = BeautifulSoup(response.text, "html.parser")
        pages = soup.find("ul", "s-scroll").find_all("li")
        return len(pages)
    except Exception:
        # e.g. AttributeError when the pager element is absent.
        print("获取页码失败", date)
        return 0

def get_data(date, page):
    """Scrape one search-result page into a flat list of post fields.

    Every successfully parsed post contributes exactly seven consecutive
    items: the first two whitespace-separated tokens of the "from" line,
    user name, text, the token after "转发 ", the token after "评论 ", and
    the whitespace-stripped text of the fourth action item.

    Posts that cannot be parsed completely are skipped as a whole, so the
    returned list always has a length that is a multiple of seven.

    Returns:
        The flat list described above; empty when the page could not be
        fetched or parsed.
    """
    response = get_data_res(date, page)
    data = list()
    if not response:
        return data
    try:
        soup = BeautifulSoup(response.text, "html.parser")
        infos = soup.find_all('div', "content")
        records = soup.find_all("div", "card-act")
    except Exception:
        # Best effort: an unparsable page yields no data.
        return data
    for info, record in zip(infos, records):
        try:
            times = info.find('p', 'from').text.split()
            recs = record.find_all('li')
            row = [
                times[0],
                times[1],
                info.find('a', "name").text,
                info.find('p', "txt").text.strip().replace(' \u200b', ''),
                # \S+ captures the whole count; a trailing lazy ``.+?``
                # would stop after a single character.
                "".join(re.findall(r'转发 (\S+)', recs[1].text)),
                "".join(re.findall(r'评论 (\S+)', recs[2].text)),
                "".join(recs[3].text.split()),
            ]
        except Exception:
            # Skip malformed posts (missing element / too few items) without
            # leaving a partial record behind or dropping the posts after it.
            continue
        data.extend(row)
    return data

def save_data_to_csv(data, file_name=None):
    """Append scraped records to a CSV file, seven fields per row.

    Args:
        data: Flat sequence whose length is a multiple of seven; it is
            reshaped into rows of seven columns.
        file_name: Target CSV path. Defaults to the module-level
            ``data_file_name``.

    Raises:
        ValueError: if ``data`` cannot be reshaped into rows of seven.
    """
    target = data_file_name if file_name is None else file_name
    rows = np.array(data).reshape(-1, 7)
    result_weibo = pd.DataFrame(rows)
    # Append: the default mode 'w' would overwrite a header row that was
    # written to the file beforehand.
    result_weibo.to_csv(target, mode='a', index=False, encoding='gb18030', header=False)

# Output CSV path; save_data_to_csv reads this module-level name.
data_file_name = "./朱一龙.csv"

# An empty DataFrame that carries only the seven column names; writing it
# puts the header row into the CSV file.
column = pd.DataFrame(columns=['日期', '时间', '用户', '内容', '转发', '评论', '点赞'])

column.to_csv(data_file_name, index=False, encoding='gb18030')

# Scrape with date=2019 and page=5 and pass the result to save_data_to_csv.
save_data_to_csv(get_data(2019, 5))

原文:https://www.cnblogs.com/forcee/p/12497345.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值