# coding=utf-8
import sys
import urllib2
import datetime
import random
from bs4 import BeautifulSoup
# Python 2 hack: reload() re-exposes sys.setdefaultencoding (hidden by
# site.py) so non-ASCII text can flow through implicit str/unicode
# conversions without UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding('utf8')
# Parallel accumulators for scraped profile data: index i across all of
# these lists describes the same user.
links = [] # profile URLs discovered so far
nameslist = [] # Zhihu display names
locationlist = [] # locations
educationlist = [] # schools
genderlist = [] # gender: 1 = male, 0 = female
agreelist = [] # upvotes received
thankslist = [] # thanks received
askslist = [] # questions asked
answerslist = [] # answers written
postslist = [] # articles published
collectionslist = [] # favorites/collections count
logslist = [] # public-edit count
# Seed profile the crawl starts from.
UrlAddress = 'https://www.zhihu.com/people/zi-you-96-25'
while True:
try:
html = urllib2.urlopen(UrlAddress)
except:
randt = random.randint(0, len(links) - 1)
UrlAddress = links[randt]
html = urllib2.urlopen(UrlAddress)
bsObj = BeautifulSoup(html, 'html.parser')
users = bsObj.findAll("", {"class": 'author-link'})
first_name = bsObj.find('title')
n = first_name.get_text().split(' - ')[0]
location = bsObj.find("", {"class": 'location item'})
l = (location.attrs['title'] if location != None else "未知")
education = bsObj.find("", {"class": 'education item'})
e = (education.attrs['title'] if education != None else "未知")
gender = bsObj.find("", {"checked": 'checked'})
g = (gender.attrs['value'] if gender != None else "未知")
agree = bsObj.find("", {"class": 'zm-profile-header-user-agree'}).find('strong')
a = agree.get_text()
thanks = bsObj.find("", {"class": 'zm-profile-header-user-thanks'}).find('strong')
t = thanks.get_text()
info = bsObj.find("div", {"class": 'profile-navbar clearfix'}).findAll('span')
ask = info[1].get_text()
ans = info[2].get_text()
post = info[3].get_text()
colle = info[4].get_text()
log = info[5].get_text()
filetxt = open('file.txt', 'w+')
if n not in nameslist:
nameslist.append(n)
locationlist.append(l)
educationlist.append(e)
genderlist.append(g)
agreelist.append(a)
thankslist.append(t)
askslist.append(ask)
answerslist.append(ans)
postslist.append(post)
collectionslist.append(colle)
logslist.append(log)
filetxt.write(n + ',')
filetxt.write(l + ',')
filetxt.write(e + ',')
filetxt.write(g + ',')
filetxt.write(a + ',')
filetxt.write(t + ',')
filetxt.write(ask + ',')
filetxt.write(ans + ',')
filetxt.write(post + ',')
filetxt.write(colle + ',')
filetxt.write(log + '\n')
print n, '|', l, '|', e, '|', g, '|', a, '|', t, '|', ask, '|', ans, '|', post, '|', colle, '|', log
for user in users:
if user.attrs['href'].startswith('/people/'):
e = 'https://www.zhihu.com' + user.attrs['href']
if e not in links:
links.append(e)
random.seed(datetime.datetime.now())
randt = random.randint(0, len(links) - 1)
print links[randt]
UrlAddress = links[randt]
# Dev note: today I fixed the problem of duplicate users appearing in the
# search and obtained a series of profile fields while traversing users;
# tomorrow, once this data is collected, I can run a simple analysis in R
# first -- mainly to get familiar with R.