Scraper Practice: Day 3

# coding=utf-8
import sys
import urllib2
import random
from bs4 import BeautifulSoup
reload(sys)
sys.setdefaultencoding('utf8')
links = []  # collected profile links
nameslist = []  # Zhihu display names
locationlist = []  # locations
educationlist = []  # schools
genderlist = []  # gender: 1 = male, 0 = female
agreelist = []  # upvotes received
thankslist = []  # thanks received
askslist = []  # questions asked
answerslist = []  # answers written
postslist = []  # posts published
collectionslist = []  # collections
logslist = []  # public edits
UrlAddress = 'https://www.zhihu.com/people/zi-you-96-25'

while True:
    try:
        html = urllib2.urlopen(UrlAddress)
    except urllib2.URLError:
        # The request failed; retry with a random profile we have already collected.
        if not links:
            raise  # nothing collected yet to fall back on
        UrlAddress = random.choice(links)
        html = urllib2.urlopen(UrlAddress)
    bsObj = BeautifulSoup(html, 'html.parser')
    users = bsObj.findAll("a", {"class": 'author-link'})  # anchors pointing at other users
    first_name = bsObj.find('title')
    n = first_name.get_text().split(' - ')[0]  # the page title looks like "<name> - Zhihu"
    location = bsObj.find(attrs={"class": 'location item'})
    l = (location.attrs['title'] if location is not None else "unknown")
    education = bsObj.find(attrs={"class": 'education item'})
    e = (education.attrs['title'] if education is not None else "unknown")
    gender = bsObj.find(attrs={"checked": 'checked'})  # the checked radio button carries the gender value
    g = (gender.attrs['value'] if gender is not None else "unknown")
    agree = bsObj.find(attrs={"class": 'zm-profile-header-user-agree'}).find('strong')
    a = agree.get_text()  # upvotes received
    thanks = bsObj.find(attrs={"class": 'zm-profile-header-user-thanks'}).find('strong')
    t = thanks.get_text()  # thanks received
    info = bsObj.find("div", {"class": 'profile-navbar clearfix'}).findAll('span')
    ask = info[1].get_text()  # questions
    ans = info[2].get_text()  # answers
    post = info[3].get_text()  # posts
    colle = info[4].get_text()  # collections
    log = info[5].get_text()  # public edits
    if n not in nameslist:  # skip users we have already recorded
        nameslist.append(n)
        locationlist.append(l)
        educationlist.append(e)
        genderlist.append(g)
        agreelist.append(a)
        thankslist.append(t)
        askslist.append(ask)
        answerslist.append(ans)
        postslist.append(post)
        collectionslist.append(colle)
        logslist.append(log)
        # Append mode, so rows from earlier iterations are preserved;
        # 'w+' would truncate the file on every pass, keeping only the last user.
        filetxt = open('file.txt', 'a')
        filetxt.write(','.join([n, l, e, g, a, t, ask, ans, post, colle, log]) + '\n')
        filetxt.close()
        print n, '|', l, '|', e, '|', g, '|', a, '|', t, '|', ask, '|', ans, '|', post, '|', colle, '|', log
    for user in users:
        if user.attrs['href'].startswith('/people/'):
            link = 'https://www.zhihu.com' + user.attrs['href']
            if link not in links:
                links.append(link)
    if not links:
        break  # the page exposed no profiles to crawl next
    # Jump to a randomly chosen collected profile for the next pass.
    UrlAddress = random.choice(links)
    print UrlAddress
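
One caveat: urlopen here goes out with urllib2's default User-Agent, which many sites, Zhihu included, tend to reject with HTTP 403, so every request can land in the except branch. Below is a minimal sketch of a hardened fetch using only the standard urllib2.Request API; the User-Agent string is just an illustrative value, not a required one:

# Sketch: fetch with a browser-like User-Agent and a timeout.
# The UA string is an arbitrary example.
def fetch(url):
    request = urllib2.Request(url, headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    })
    return urllib2.urlopen(request, timeout=10)

Swapping fetch(UrlAddress) in for the two bare urlopen calls leaves the rest of the loop unchanged.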

Today I fixed the problem of the same users showing up repeatedly in the crawl, and I can now pull a full set of profile fields while traversing users. Tomorrow I'll take the collected data and run a simple first analysis in R, mainly to get familiar with R.
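
Since file.txt is plain comma-separated text, a quick sanity check in Python before handing it to R only needs the standard csv module. A sketch, assuming the eleven-column row layout written above (it will skip rows where a field itself contains a comma):

# Sketch: read file.txt back and eyeball the data before the R analysis.
import csv
from collections import Counter

with open('file.txt') as f:
    rows = [row for row in csv.reader(f) if len(row) == 11]

print len(rows), 'users collected'
print Counter(row[1] for row in rows).most_common(5)  # top five locations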

