import re
import urllib2
import random
from bs4 import BeautifulSoup

link = []       # profile URLs discovered so far
nameslist = []  # usernames already seen, used to skip duplicates

def print_name(UrlAddress):
    html = urllib2.urlopen(UrlAddress)
    bsObj = BeautifulSoup(html, 'html.parser')
    # Grab every anchor whose href starts with /people/
    names = bsObj.findAll("a", href=re.compile("^(/people/)"))
    china_name = bsObj.find('title')
    # The page title has the form "<name> - 知乎"; keep only the name
    print china_name.get_text().split(' - ')[0]
    for name in names:
        e = name.attrs['href'].split('/')[2]
        if e not in nameslist:
            nameslist.append(e)
            link.append('https://www.zhihu.com/people/' + e)
    if not link:  # nothing to crawl next
        return
    # Hop to a randomly chosen profile and recurse
    # (note: this recurses until Python's recursion limit is hit)
    t = random.randint(0, len(link) - 1)
    print link[t]
    print_name(link[t])

print_name('https://www.zhihu.com/people/liu-hao-44-70')
This gets the crawler hopping from one link to the next on its own, but it revisits the same pages far too often. That still needs fixing; one option is to change the pattern used when matching links, as sketched below.
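A minimal sketch of that idea, under the assumption that the loose pattern "^(/people/)" also matches sub-page links such as /people/&lt;name&gt;/answers and so yields the same username over and over: anchoring the regex to a bare profile path keeps one link per user. The pattern below is an illustrative guess, not verified against Zhihu's current markup.

import re

# Hypothetical tighter pattern (assumption, not the author's final fix):
# match only bare /people/<name> paths, rejecting sub-pages that would
# produce the same username repeatedly.
profile_pattern = re.compile(r"^/people/[\w-]+$")

print bool(profile_pattern.match('/people/liu-hao-44-70'))          # True
print bool(profile_pattern.match('/people/liu-hao-44-70/answers'))  # False

Passing this to findAll("a", href=profile_pattern) should then collect each user once instead of once per profile tab.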
Trying Out Web Crawlers: Day Two