用 Python 获取知乎本人关注列表并批量存入本地 MySQL 数据库

先模拟登录,将 cookies 保存到本地。代码中获取知乎关注列表所用的链接是 2016 年知乎电脑网页版改版以前的接口:它返回一组 JSON 数据(网页下拉时用来自动填充列表),请求时需要传 _xsrf 和 hash_id。2016 年 11 月左右知乎改版后有了新的 API,新的 API 不需要获取 _xsrf 和 hash_id,只要有用户的内部 name 即可,不过返回的数据中不再包含赞同数和提问数。

MySQL 批量插入用的是 pymysql 的 executemany 方法。

import http.cookiejar
import requests
import re
import json
import math
import time
import pymysql.cursors
from zhihu.author import Author
from bs4 import BeautifulSoup
from collections import deque

# NOTE(review): this rebinds the name `deque` from the imported class to a
# single module-level instance, so the class is no longer reachable under
# that name. getfollwer() appends to this shared instance and returns it.
deque=deque()
# User-Agent string sent with every HTTP request made in this file.
agent = "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) " \
        "Chrome/46.0.2490.76 Mobile Safari/537.36"
# Header dict passed as `headers=` to each session.get / session.post call.
headers = {
    'User-Agent': agent
}
# Example Zhihu user slug (the one passed to followers_num in the __main__ section): zui-jiu-qing-niu-4
def getsession():
    """Return a requests session that uses cookies loaded from a local file.

    The cookies are read from an LWP-format file named "cookies" in the
    current working directory; `ignore_discard=True` makes the jar also load
    cookies that are marked to be discarded.

    Raises:
        Whatever `LWPCookieJar.load` raises when the file is missing or
        unreadable (the error is not handled here).
    """
    cookie_jar = http.cookiejar.LWPCookieJar(filename="cookies")
    cookie_jar.load(ignore_discard=True)

    sess = requests.session()
    sess.cookies = cookie_jar
    return sess

def followers_num(session,id):
    """Return the count shown in the first sidebar item of a user's followees page.

    Args:
        session: object with a requests-style `get(url, headers=...)` method.
        id: the user's URL slug, inserted into the profile URL.

    Returns:
        The text of the first `<strong>` inside the first `a.item` link of
        the `div.zu-main-sidebar` element, converted to `int`.

    NOTE(review): despite the function name, the page requested is
    `/followees`; presumably the first sidebar item is the followee count --
    confirm against the page markup.
    """
    url = 'https://www.zhihu.com/people/'+id+'/followees'
    page = session.get(url, headers=headers)
    soup = BeautifulSoup(page.text, 'html.parser')

    # Walk down sidebar -> first item link -> its <strong> element.
    sidebar = soup.find("div", {'class': 'zu-main-sidebar'})
    first_item = sidebar.find('a', {'class': 'item'})
    count_text = first_item.find('strong').text
    return int(count_text)

def get_xsrf(session):
    """Fetch the Zhihu index page and return its `_xsrf` form value.

    The `_xsrf` value changes between page loads, so it is scraped from the
    page each time instead of being stored.

    Args:
        session: object with a requests-style `get(url, headers=...)` method.

    Returns:
        The first `_xsrf` value found in the page HTML, as a `str`.

    Raises:
        IndexError: if the page contains no `name="_xsrf" value="..."` field.
    """
    response = session.get('http://www.zhihu.com', headers=headers)
    # findall yields a list of captured values; only the first one is used.
    tokens = re.findall(r'name="_xsrf" value="(.*?)"', response.text)
    return str(tokens[0])

def getfollwer(fonum,session,xsrf,hash_id="29d75b4013b4631aaf7fe5848f3f6113"):
    """Download the followee list page by page and collect `Author` objects.

    Pages of 20 entries are requested from the `ProfileFolloweesListV2`
    endpoint. Each entry in the response's `msg` list is an HTML fragment
    that is parsed for the user's name, homepage and four counters.

    Args:
        fonum: total number of followees; determines how many pages of 20
            are requested.
        session: object with a requests-style `post(url, data=..., headers=...)`
            method.
        xsrf: the `_xsrf` token sent with every request.
        hash_id: value of the `hash_id` request parameter. Defaults to the
            value that was previously hard-coded in this function.

    Returns:
        The module-level `deque` instance, with one `Author` appended per
        parsed entry. NOTE: the container is shared module state, so entries
        accumulate across repeated calls.

    Raises:
        AttributeError / ValueError: if a fragment lacks one of the expected
            links or a counter text is not numeric (not handled here).
    """
    page_count = math.ceil(fonum / 20)
    num = 1
    for page in range(page_count):
        # Compact separators reproduce the exact request string format:
        # {"offset":N,"order_by":"created","hash_id":"..."}
        params = json.dumps(
            {"offset": page * 20, "order_by": "created", "hash_id": hash_id},
            separators=(',', ':'),
        )
        postdata = {'method': 'next',
                    'params': params,
                    '_xsrf': xsrf}
        ress = session.post('https://www.zhihu.com/node/ProfileFolloweesListV2', data=postdata, headers=headers)
        jsons = json.loads(ress.content.decode('utf-8'))
        print(jsons['msg'])
        # Pause after each page request (presumably to avoid hammering the server).
        time.sleep(5)
        for fragment in jsons['msg']:
            card = BeautifulSoup(fragment, 'html.parser')
            print(num)
            # Look the author link up once; it carries both name and homepage.
            author_link = card.find('a', {'class': 'zg-link author-link'})
            name = author_link.text
            print("用户:" + name)
            homepage = author_link['href']
            # The user slug is the last path segment of the homepage URL;
            # taking it by segment does not depend on the URL prefix length.
            user_id = homepage.rstrip('/').rsplit('/', 1)[-1]
            print(user_id)

            # Each counter link's text ends with a short label; the slice
            # drops that trailing label before converting to int.
            follower_num = int(card.find('a', {'href': '/people/' + user_id + '/followers'}).text[:-4])
            print(follower_num)
            question_num = int(card.find('a', {'href': '/people/' + user_id + '/asks'}).text[:-3])
            print(question_num)
            answer_num = int(card.find('a', {'href': '/people/' + user_id + '/answers'}).text[:-3])
            print(answer_num)
            agree_num = int(card.find('a', {'href': '/people/' + user_id, 'class': 'zg-link-gray-normal'}).text[:-3])
            print(agree_num)
            author = Author(user_id, name, homepage, follower_num, question_num, answer_num, agree_num)
            deque.append(author)
            print(author.name)
            print(author.homepage)
            num = num + 1
            print("================================================================================================")
    return deque

def insertzhihu(deque):
    """Bulk-insert the collected authors into the `zhihu_author` table.

    Args:
        deque: iterable of objects exposing `id`, `name`, `homepage`,
            `follower_num`, `question_num`, `answer_num` and `agree_num`.

    All rows are sent with a single `executemany` call and committed
    together. The cursor and connection are always closed, including when
    the insert fails (in which case nothing is committed). When there are no
    rows, no database connection is opened.
    """
    values = [
        (author.id, author.name, author.homepage, author.follower_num,
         author.question_num, author.answer_num, author.agree_num)
        for author in deque
    ]
    if not values:
        return

    # NOTE(review): database credentials are hard-coded here; consider
    # loading them from configuration or the environment.
    connetion = pymysql.connect(host='localhost',
                                user='root',
                                password='159366',
                                db='zhihu',
                                charset='utf8',
                                cursorclass=pymysql.cursors.DictCursor)
    try:
        with connetion.cursor() as cursor:
            cursor.executemany("insert into zhihu_author values(%s,%s,%s,%s,%s,%s,%s)", values)
        connetion.commit()
    finally:
        # Release the connection whether or not the insert succeeded.
        connetion.close()


if __name__ == "__main__":
    # 1. Restore the logged-in session from the saved cookies file.
    zhihu_session = getsession()
    # 2. Scrape the _xsrf token required by the followee-list endpoint.
    token = get_xsrf(zhihu_session)
    # 3. Read how many entries there are, which fixes the number of pages.
    followee_total = followers_num(zhihu_session, "zui-jiu-qing-niu-4")
    # 4. Download and parse every page, then store all rows in MySQL.
    authors = getfollwer(followee_total, zhihu_session, token)
    insertzhihu(authors)

  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值