A simple Python 3.5 crawler for basic Zhihu user information

A simple crawler for Zhihu user information.  
It uses Python 3.5 and a MySQL database; you need to set up the environment yourself.  
The code was tested successfully on Windows (can't afford a Mac, alas).  
For the database insert I wrote two versions, a plain one and a coroutine one, mainly to compare how the two perform (a rough sketch of the coroutine variant appears after the MySQL section below).  
Since I'm still a Python novice, the code is a bit messy.  

Enough chatter, let's get started:  

Global variables:

# imports used throughout (requests, beautifulsoup4, mysql-connector-python, Pillow are third-party packages)
import http.cookiejar
import json
import re
import threading
import time

import requests
import mysql.connector
from bs4 import BeautifulSoup
from PIL import Image

# request headers
headers = {
    "User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:41.0) Gecko/20100101 Firefox/41.0',
    "Referer": "http://www.zhihu.com/",
    'Host': 'www.zhihu.com',
}
# file name used to persist cookies
filename = 'cookie'
# create a requests session
session = requests.Session()
# use an LWPCookieJar so cookies can be saved in Set-Cookie3 format
session.cookies = http.cookiejar.LWPCookieJar(filename)

Simulated login to Zhihu:  

# log in
def login():
    # on the first run you need to enter your own account and password
    username = input('Account: ')
    password = input('Password: ')

    # an "@" in the account means e-mail login
    if "@" in username:
        print('Logging in with e-mail...')
        url = 'https://www.zhihu.com/login/email'
        data = {'_xsrf': get_xsrf(),
                'password': password,
                'remember_me': 'true',
                'email': username
                }
    else:
        print('Logging in with phone number...')
        url = 'http://www.zhihu.com/login/phone_num'
        data = {'_xsrf': get_xsrf(),
                'password': password,
                'remember_me': 'true',
                'phone_num': username
                }
    # if no captcha is required, this first post succeeds
    try:
        result = session.post(url, data=data, headers=headers)
        print((json.loads(result.text))['msg'])
    # if a captcha is required, fetch it and post again
    except:
        data['captcha'] = get_captcha()
        print(data)
        result = session.post(url, data=data, headers=headers)
        print(result.text)
        print((json.loads(result.text))['msg'])
    # save the cookies locally
    session.cookies.save(ignore_discard=True, ignore_expires=True)
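
On later runs there is no need to log in again; the cookies saved above can be loaded back into the session (the same load call is used in selectMessage further down). A small hypothetical helper for that, not part of the original code:

# hypothetical helper: reuse the saved cookie file if it exists, otherwise log in interactively
def ensure_login():
    try:
        session.cookies.load(filename=filename, ignore_discard=True, ignore_expires=True)
    except Exception:
        login()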


Get the _xsrf token:

# extract the hidden _xsrf form field from the home page
def get_xsrf():
    response = session.get('https://www.zhihu.com', headers=headers)
    html = response.text
    get_xsrf_pattern = re.compile(r'<input type="hidden" name="_xsrf" value="(.*?)"')
    _xsrf = re.findall(get_xsrf_pattern, html)[0]
    return _xsrf


Get the captcha: 

# download the captcha image, show it, and ask the user to type it in
def get_captcha():
    t = str(int(time.time() * 1000))
    captcha_url = 'http://www.zhihu.com/captcha.gif?r=' + t + "&type=login"
    response = session.get(captcha_url, headers=headers)
    with open('cptcha.gif', 'wb') as f:
        f.write(response.content)
    im = Image.open('cptcha.gif')
    im.show()
    captcha = input('Enter the captcha: ')
    return captcha


Parse out the user's basic information (Zhihu's pages change all the time, so if the script stops working this parsing code is the most likely culprit; adjust the selectors yourself, see the BeautifulSoup documentation): 

# get a user's basic information
def get_userInfo(userID):
    user_url = 'https://www.zhihu.com/people/' + userID
    response = session.get(user_url, headers=headers)

    # I develop on Windows, so html5lib is the parser I settled on (will switch once I can afford a Mac)
    soup = BeautifulSoup(response.content, 'html5lib')

    # dump the page to a file for debugging
    # with open('zhihuprifile.html', 'wb') as f:
    #     f.write(response.content)

    d = {}
    # userId = soup.find("a",class_="Tabs-link")["href"].split("/")[2]
    d['userId'] = userID

    try:
        nickname = soup.find_all('span', {'class': 'ProfileHeader-name'})[0].string
    except:
        nickname = "None"
    d['nickname'] = nickname


    try:
        word = soup.find('span', class_="RichText ProfileHeader-headline").string
        if word == None:
            word = 'None'
    except:
        word = "None"
    d['word'] = word

    try:
        business = soup.find_all('div', {'class': 'ProfileHeader-iconWrapper'})[0].next_sibling
        if business == None:
            business = 'None'
    except:
        business = 'None'
    d['business'] = business

    try:
        company = soup.find_all('div', {'class': 'ProfileHeader-divider'})[0].next_sibling
        if company == None:
            company = 'None'
    except:
        company = 'None'
    d['company'] = company

    try:
        location = soup.find_all('div', {'class': 'ProfileHeader-divider'})[1].next_sibling
        if location == None:
            location = 'None'
    except:
        location = "None"
    d['location'] = location

    try:
        school = soup.find_all('div', {'class': 'ProfileHeader-iconWrapper'})[1].next_sibling
        if school == None:
            school = 'None'
    except:
        school = 'None'
    d['school'] = school

    try:
        subject = soup.find_all('div', {'class': 'ProfileHeader-divider'})[2].next_sibling
        if subject == None:
            subject = 'None'
    except:
        subject = 'None'
    d['subject'] = subject

    try:
        # if the split fails the user has no "answers" entry, and an exception is raised
        answers = soup.find('div', {'class': 'IconGraf-iconWrapper'}).next_sibling.split(' ')[1]
    except:
        answers = None
    if answers == None:
        answers = 0
    d['answers'] = answers

    try:
        followees = soup.find_all('div', {'class': 'Profile-followStatusValue'})[0].string
    except:
        followees = None
    if followees == None:
        followees = 0
        # print('followees: %s' % followees)
    d['followees'] = followees

    try:
        followers = soup.find_all('div', {'class': 'Profile-followStatusValue'})[1].string
    except:
        followers = None
    if followers == None:
        followers = 0
    d['followers'] = followers

    return d
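
As a quick sanity check you can call the function directly once logged in; the ID below is just a placeholder URL token, not a real account:

# quick manual check ('some-user-token' is a placeholder; substitute a real profile URL token)
info = get_userInfo('some-user-token')
print(info['nickname'], info['followers'])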


Get the people the user follows (again, the parsing here may need updating on your side):

# get the followees' profile urls (only the first three shown on the page)
def followeesUrl(userId):
    user_url = 'https://www.zhihu.com/people/' + userId + "/following"
    response = session.get(user_url, headers=headers)

    # html5lib again, for the same reason as above
    soup = BeautifulSoup(response.content, 'html5lib')
    # with open('following.html', 'wb') as f:
    #     f.write(response.content)
    urls = soup.find_all("div", {'aria-haspopup': "true"})

    # collect the urls, dropping duplicates
    urllist = set([])
    for url in urls:
        urllist.add(url.a["href"])

    # join them into a single comma-separated string and return it
    saveUrl = ''
    for u in urllist:
        if saveUrl != '':
            saveUrl = saveUrl + "," + u
        else:
            saveUrl = u
    return saveUrl


The MySQL side of things:

# database connection info; fill in your own credentials
conn = mysql.connector.connect(host='localhost', user='****', password='****', database='zhihu')
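
The post says the SQL file is on GitHub and the schema itself is not shown; a minimal table that matches the columns used by saveInfo below, plus the flag column used to track crawl state, might look like the following (column types, sizes and the primary key are my assumptions):

# hypothetical schema sketch; the real SQL file from GitHub may well differ
def create_table():
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS zhihu (
            userId    VARCHAR(64) PRIMARY KEY,
            nickname  VARCHAR(64),
            word      VARCHAR(255),
            business  VARCHAR(64),
            company   VARCHAR(64),
            location  VARCHAR(64),
            school    VARCHAR(64),
            subject   VARCHAR(64),
            answers   INT,
            followers INT,
            followees INT,
            f_url     TEXT,
            flag      TINYINT DEFAULT 0
        )
    """)
    conn.commit()
    cursor.close()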


# save one record to the database
def saveInfo(info):
    cursor = conn.cursor()
    data = [str(info.get("userId")), str(info.get("nickname")), str(info.get("word")), str(info.get("business")),
            str(info.get("company")), str(info.get("location")), str(info.get("school")), str(info.get("subject")),
            int(info.get("answers")), int(info.get("followers")), int(info.get("followees")), info.get("f_url")]
    try:
        cursor.execute(
            "insert into zhihu(userId,nickname,word,business,company,location,school, subject,answers,followers,followees,f_url) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
            data)
    except:
        # most likely a duplicate userId that has already been stored; ignore it
        pass

    conn.commit()
    cursor.close()


# set flag to 1, meaning this user has already been crawled
def changeSate(userId):
    cursor = conn.cursor()
    # sql = "updata zhihu set flag = 1 where userId = " + userId
    # cursor.execute(sql)
    data = [userId]
    cursor.execute("update zhihu set flag = 1 where userId = %s", data)
    conn.commit()
    cursor.close()


# fetch one user that has not been crawled yet, together with their followee urls
def selectOneUrl():
    cursor = conn.cursor()
    flag = [0, ]
    # a single row with flag = 0 is all we need
    cursor.execute("select userId,f_url from zhihu where flag = %s limit 1", flag)
    data = cursor.fetchall()
    userId = data[0][0]
    url = data[0][1]
    urlList = []
    for u in url.split(","):
        urlList.append(u)
    # wrap the result in a SelectOne object and return it
    urlOne = SelectOne(userId, urlList)
    conn.commit()
    cursor.close()
    return urlOne
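
The coroutine-based insert mentioned at the top is not reproduced in this post. Purely as a sketch, assuming asyncio with the default thread pool is used (the actual version on GitHub may take a different approach), it could look like this:

import asyncio

async def save_info_async(loop, info):
    # mysql.connector is blocking, so push saveInfo onto a worker thread
    # and await its completion without blocking the event loop
    await loop.run_in_executor(None, saveInfo, info)

def save_many(infos):
    # run several inserts "concurrently"; a single connection should not be
    # shared by truly concurrent inserts, so in practice each task would need
    # its own connection (or a connection pool)
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.gather(*(save_info_async(loop, i) for i in infos)))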



Putting it all together (crawl continuously and keep storing into the database):

#------------------- crawling routine ----------------------------
def threadingExecution():
    # pull one user that has not been crawled yet from the database
    try:
        urltemp = selectOneUrl()
    except:
        print("No more users left to crawl")
        return
    userId = urltemp.get_userId()
    urls = urltemp.get_urlList()
    if urls != None and urls != '':
        for u in urls:
            try:
                id = u.split("/")[2]
            except:
                continue
            info = get_userInfo(id)

            nickname = info['nickname']
            if nickname == 'None':
                continue
            print(nickname)
            info['f_url'] = followeesUrl(info['userId'])
            saveInfo(info)
            # sleep 5 seconds between requests to avoid being rate-limited
            time.sleep(5)
    changeSate(userId)

# crawl users in batches
def selectMessage():
    while True:
        # start time of this round
        start = time.time()
        # note: pass the function itself, not its result, as the thread target
        t = threading.Thread(target=threadingExecution)
        t.setDaemon(True)
        t.start()
        t.join(30)
        # end time
        end = time.time()
        # leave the loop once a round takes longer than the timeout
        if (end-start) > 29:
            break

    print("Timed out----")
    # after leaving the loop, call selectMessage again

    # sleep three minutes, then reload the cookies and resume
    time.sleep(180)

    session.cookies.load(filename=filename, ignore_discard=True)
    selectMessage()


# holds one row fetched from the database
class SelectOne:
    def __init__(self, userId, urlList):
        self.__userId = userId
        self.__urlList = urlList

    def get_userId(self):
        return self.__userId

    def get_urlList(self):
        return self.__urlList

    def set_userId(self, userId):
        self.__userId = userId

    def set_urlList(self, urlList):
        self.__urlList = urlList
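
The post does not show how everything is wired together (the full code is on GitHub). A minimal driver, assuming the zhihu table already contains at least one row with flag = 0 (seeded by hand, or by running get_userInfo/followeesUrl/saveInfo on a starting user), might look like this; ensure_login is the hypothetical helper sketched after login() above:

if __name__ == '__main__':
    # reuse saved cookies if possible, otherwise log in interactively
    ensure_login()
    # keep crawling users whose flag is still 0
    selectMessage()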

Database screenshot: (image omitted here)

Finally, the code is commented in detail. The SQL file and the complete code can be found on GitHub.
