#coding:utf-8
'''
Starting from a Baidu Tieba forum's front page, crawl the links to each thread, page through every
thread to collect the profile URLs of all posting users, then visit each profile page and record the
user's forum age, post count, gender, follower count, followee count, and the links to the
followee-list and follower-list pages.
Tested on Win7 (64-bit), Python 2.7, PyCharm.
'''
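# Crawl pipeline, top to bottom (each step is a function defined below):
#   topicenter() -> thread links on the forum's first list page + links to further list pages
#   get_themes() -> thread links collected from list pages 2 through 10
#   homeget()    -> profile links on a thread's first page + the thread's remaining page URLs
#   pagesget()   -> profile links gathered from those remaining pages
#   urlget()     -> a userinfo object parsed from one profile page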
import urllib2
import re
from bs4 import BeautifulSoup
from numpy import unique
import time
# Parse a profile page for the user's name, followee count, follower count, follower-list link,
# followee-list link, forum age, post count, and gender.
class userinfo(object):
    def __init__(self, url):
        self.url = url
        self.username = None
        self.concern_num = None
        self.fans_num = None
        self.age = None
        self.tie = None
        self.sex = None
        self.concern_link = None
        self.fans_link = None

    def get_fans(self):
        return self.fans_link

    def get_concern(self):
        return self.concern_link
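# Illustrative, commented-out use of userinfo; the URL and field values are hypothetical:
#   u = userinfo("http://tieba.baidu.com/home/main?un=somename")
#   u.age, u.tie = 2.5, "103"             # forum age in years, post count
#   print u.get_fans(), u.get_concern()   # both None until urlget() fills them in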
def urlget(url):
    user = userinfo(url)
    prefix = "http://tieba.baidu.com"
    web_doc = urllib2.urlopen(url).read()
    soup = BeautifulSoup(web_doc, "html.parser", from_encoding="utf-8")
    username = soup.find_all('span', attrs={'class': 'userinfo_username '})[0].get_text()  # user's display name
    print "username=", username
    user.username = username
    concernlst = soup.find_all('span', attrs={'class': "concern_num"})
    concern_link = concernlst[0].find_all('a', attrs={'href': True, 'target': '_blank'})[0]['href']
    concern_link = prefix + concern_link  # link to the followee list
    user.concern_link = concern_link
    concern_num = concernlst[0].find_all('a', attrs={'href': True, 'target': '_blank'})[0].get_text()
    print "concern_num=", float(concern_num)
    user.concern_num = int(concern_num)  # was printed but never stored before
    fans_link = concernlst[1].find_all('a', attrs={'href': True, 'target': '_blank'})[0]['href']
    fans_link = prefix + fans_link  # link to the follower list
    user.fans_link = fans_link
    fans_num = concernlst[1].find_all('a', attrs={'href': True, 'target': '_blank'})[0].get_text()
    print "fans_num=", fans_num
    user.fans_num = int(fans_num)
    infor = soup.find_all('div', attrs={'class': 'userinfo_userdata'})[0]
    agetie = infor.find_all('span', attrs={'class': False})
    print "age=", agetie[0].get_text()[3:-1]  # first span is the forum age, second is the post count
    user.age = float(agetie[0].get_text()[3:-1])
    print "tie=", agetie[1].get_text()[3:]
    user.tie = agetie[1].get_text()[3:]
    p_sex = re.compile(r'userinfo_sex.*')
    print infor.find_all('span', attrs={'class': p_sex})[0]
    sexstr = infor.find_all('span', attrs={'class': p_sex})[0]['class'][1]  # e.g. 'userinfo_sex_male'
    print "the sex of the user is : "
    if "female" in sexstr:  # must test "female" first, since "male" is a substring of it
        print "female"
        user.sex = "female"
    elif "male" in sexstr:
        print "male"
        user.sex = "male"
    else:
        print "no sex"
        user.sex = "no sex"
    return user
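# Minimal, commented-out sketch of urlget; the profile URL is the one from the test block near
# the end of this file:
#   user = urlget("http://tieba.baidu.com//home/main?un=Enven_lei&ie=utf-8&fr=pb&ie=utf-8")
#   print user.sex, user.age, user.fans_num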
'''Collect the profile pages of a user's followees'''
def getconcern(url):
    concern_lst = getfans(url)  # the followee-list page has the same layout, so reuse getfans
    return concern_lst
'''Collect the profile pages of a user's followers'''
def getfans(url):
    prefix = "http://tieba.baidu.com"
    print url
    web_doc = urllib2.urlopen(url).read()
    soup = BeautifulSoup(web_doc, "html.parser", from_encoding="utf-8")
    span = soup.find_all('span', attrs={'class': 'name'})
    p_href = re.compile('/home/main.*')
    home_lst = []
    for s in span:
        homelink = s.find_all('a', attrs={'href': p_href, 'target': '_blank'})[0]['href']
        print homelink
        homelink = prefix + homelink
        home_lst.append(homelink)
    return home_lst
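# Commented-out sketch: feed the follower-list and followee-list links that urlget() stored on a
# user back in, and get the corresponding users' own profile URLs:
#   fans_homepages = getfans(user.get_fans())
#   concern_homepages = getconcern(user.get_concern())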
'''Given a thread URL, return the set of user profile links on its first page, plus the URLs of the thread's remaining pages'''
# Extract the profile links, then read the pager to determine the page count and build the page URLs.
def homeget(url):
    web_doc = urllib2.urlopen(url).read()
    time.sleep(1)
    print "homeget,sleeping..."
    soup = BeautifulSoup(web_doc, "html.parser", from_encoding="utf-8")
    p_home = re.compile(r'/home/main\?.*')  # escape the '?' so it matches literally
    homenode = soup.find_all('a', attrs={'href': p_home, 'target': '_blank', 'class': 'p_author_face '})  # profile-link nodes
    prefix = "http://tieba.baidu.com/"
    linklst = [prefix + home['href'] for home in homenode]  # profile URLs of the posters on this page
    try:
        locate = soup.find_all('li', attrs={'class': 'l_pager pager_theme_5 pb_list_pager'})[0]
    except IndexError:  # no pager element on this page
        print url
        return unique(linklst), []
    alst = locate.find_all('a', attrs={"href": True})
    if alst == []:
        return unique(linklst), []
    else:
        href = alst[len(alst) - 1]['href']
        pagenum = int(href.split('=')[1])  # the last pager link carries the page count
        pagelst = []
        for i in range(2, pagenum + 1):
            page_link = prefix + href.split("=")[0] + "=" + str(i)
            pagelst.append(page_link)
        return unique(linklst), pagelst
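# Worked example of the pager arithmetic above (the href value is hypothetical): if the last
# pager link is '/p/4570812012?pn=7', then pagenum = 7 and the loop builds
#   http://tieba.baidu.com//p/4570812012?pn=2 ... ?pn=7
# (page 1 was already parsed for profile links before the pager was read).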
'''Given a list of page URLs, collect the user profile links from every page and return them as one set'''
def pagesget(page_lst):
    if page_lst == []:
        return set()
    prefix = "http://tieba.baidu.com/"
    totalset = set()
    for page in page_lst:
        web_doc = urllib2.urlopen(page).read()
        time.sleep(1)
        print "pagesget,sleeping..."
        soup = BeautifulSoup(web_doc, "html.parser", from_encoding="utf-8")
        p_home = re.compile(r'/home/main\?.*')
        homenode = soup.find_all('a', attrs={'href': p_home, 'target': '_blank'})  # profile-link nodes; adding 'class':'p_author_face ' would narrow this further
        linklst = [prefix + home['href'] for home in homenode]  # profile URLs of the posters on this page
        totalset = totalset | set(linklst)
    return totalset
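# Commented-out sketch chaining homeget and pagesget to cover one whole thread; the thread URL
# is the one from the test block below:
#   first_page_users, rest_pages = homeget("http://tieba.baidu.com/p/4570812012")
#   all_users = set(first_page_users) | pagesget(rest_pages)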
'''Given a forum, collect the thread links on its first list page'''
def topicenter(url="http://tieba.baidu.com/f?kw=%D6%D0%B9%FA%D2%A9%BF%C6%B4%F3%D1%A7"):
    web_doc = urllib2.urlopen(url).read()
    soup = BeautifulSoup(web_doc, "html.parser", from_encoding="utf-8")
    p = re.compile(r'/p/\d{10}')
    theme_url = soup.find_all('a', attrs={'href': p, 'title': True, 'target': '_blank', 'class': 'j_th_tit'})
    prefix = "http://tieba.baidu.com"
    url_lst = []
    for theme in set(theme_url):  # complete the relative thread links
        theme = prefix + theme['href']
        url_lst.append(theme)
    theme_page = soup.find_all('a', attrs={'class': ' pagination-item '})
    theme_page_links = [theme['href'] for theme in theme_page]
    return url_lst, theme_page_links
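# The default kw value above is the GBK percent-encoding of the forum name u'中国药科大学'
# (China Pharmaceutical University). A sketch of building such a URL for another forum:
#   import urllib
#   kw = urllib.quote(u'中国药科大学'.encode('gbk'))
#   url = "http://tieba.baidu.com/f?kw=" + kw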
'''Collect the thread links from list pages 2 through 10'''
def get_themes(theme_page_links):
    if theme_page_links == []:
        return []  # returning None here would break the set union below
    p = re.compile(r'/p/\d{10}')
    url_lst = []
    prefix = "http://tieba.baidu.com"
    for theme in theme_page_links:
        web_doc = urllib2.urlopen(theme).read()
        print "sleeping......"
        time.sleep(1)
        soup = BeautifulSoup(web_doc, "html.parser", from_encoding="utf-8")
        theme_url = soup.find_all('a', attrs={'href': p, 'title': True, 'target': '_blank', 'class': 'j_th_tit'})
        for t in set(theme_url):  # complete the relative thread links
            url_lst.append(prefix + t['href'])
    return url_lst
#url = "http://tieba.baidu.com//home/main?un=Enven_lei&ie=utf-8&fr=pb&ie=utf-8"
#concern_link,fans_link = urlget(url)
#homelst = getconcern(concern_link)
#url="http://tieba.baidu.com/p/4559199887"
#homelst = homeget(url)
#print homelst
'''主题连接测试'''
#lst,next_page = topicenter()
#print lst
#print next_page
'''抓取页连接并返回当前页内容'''
#url = 'http://tieba.baidu.com/p/4570812012'
#s,p = homeget(url)
#print s
#print p
#pageset = pagesget(p)
#print totalset
themes, pages = topicenter()
otherthemes = get_themes(pages)
themes = set(themes) | set(otherthemes)  # thread links from the first ten list pages
user_url = set()
for pg in themes:
    curruser, pagetalk = homeget(pg)  # profile links on the thread's first page, plus its remaining page URLs
    nextuser = pagesget(pagetalk)  # profile links from the remaining pages
    themeuser = set(curruser) | nextuser  # all profile links in this thread
    user_url = user_url | themeuser  # merge into the overall user set
    print pg
    break  # stop after the first thread (testing); remove this line to crawl every thread
print "the number of active username in baidutieba is: ", len(user_url)  # count the users
for i in user_url:
    user = urlget(i)
    print user.get_fans()
    print user.get_concern()
    print user.url
    print user.username