#coding:utf-8
'''
Starting from a Baidu Tieba forum's front page, crawl the links to each thread, page through every
thread to collect the profile URLs of all posting users, then visit each profile page and record the
user's forum age, post count, gender, follower count, followee count, and the links to the
followee-list and follower-list pages.
Tested on Win7 (64-bit), Python 2.7, PyCharm.
'''
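# Crawl pipeline, top to bottom (each step is a function defined below):
#   topicenter() -> thread links on the forum's first list page + links to further list pages
#   get_themes() -> thread links collected from list pages 2 through 10
#   homeget()    -> profile links on a thread's first page + the thread's remaining page URLs
#   pagesget()   -> profile links gathered from those remaining pages
#   urlget()     -> a userinfo object parsed from one profile page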
import urllib2
import re
from bs4 import BeautifulSoup
from numpy import unique
import time
# Parse a profile page for the user's name, followee count, follower count, follower-list link,
# followee-list link, forum age, post count, and gender.
class userinfo(object):
    def __init__(self, url):
        self.url = url
        self.username = None
        self.concern_num = None
        self.fans_num = None
        self.age = None
        self.tie = None
        self.sex = None
        self.concern_link = None
        self.fans_link = None

    def get_fans(self):
        return self.fans_link

    def get_concern(self):
        return self.concern_link
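# Illustrative, commented-out use of userinfo; the URL and field values are hypothetical:
#   u = userinfo("http://tieba.baidu.com/home/main?un=somename")
#   u.age, u.tie = 2.5, "103"             # forum age in years, post count
#   print u.get_fans(), u.get_concern()   # both None until urlget() fills them in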
def urlget(url):
    user = userinfo(url)
    prefix = "http://tieba.baidu.com"
    web_doc = urllib2.urlopen(url).read()
    soup = BeautifulSoup(web_doc, "html.parser", from_encoding="utf-8")
    username = soup.find_all('span', attrs={'class': 'userinfo_username '})[0].get_text()  # user's display name
    print "username=", username
    user.username = username
    concernlst = soup.find_all('span', attrs={'class': "concern_num"})
    concern_link = concernlst[0].find_all('a', attrs={'href': True, 'target': '_blank'})[0]['href']
    concern_link = prefix + concern_link  # link to the followee list
    user.concern_link = concern_link
    concern_num = concernlst[0].find_all('a', attrs={'href': True, 'target': '_blank'})[0].get_text()
    print "concern_num=", float(concern_num)
    user.concern_num = int(concern_num)  # was printed but never stored before
    fans_link = concernlst[1].find_all('a', attrs={'href': True, 'target': '_blank'})[0]['href']
    fans_link = prefix + fans_link  # link to the follower list
    user.fans_link = fans_link
    fans_num = concernlst[1].find_all('a', attrs={'href': True, 'target': '_blank'})[0].get_text()
    print "fans_num=", fans_num
    user.fans_num = int(fans_num)
    infor = soup.find_all('div', attrs={'class': 'userinfo_userdata'})[0]
    agetie = infor.find_all('span', attrs={'class': False})
    print "age=", agetie[0].get_text()[3:-1]  # first span is the forum age, second is the post count
    user.age = float(agetie[0].get_text()[3:-1])
    print "tie=", agetie[1].get_text()[3:]
    user.tie = agetie[1].get_text()[3:]
    p_sex = re.compile(r'userinfo_sex.*')
    print infor.find_all('span', attrs={'class': p_sex})[0]
    sexstr = infor.find_all('span', attrs={'class': p_sex})[0]['class'][1]  # e.g. 'userinfo_sex_male'
    print "the sex of the user is : "
    if "female" in sexstr:  # must test "female" first, since "male" is a substring of it
        print "female"
        user.sex = "female"
    elif "male" in sexstr:
        print "male"
        user.sex = "male"
    else:
        print "no sex"
        user.sex = "no sex"
    return user
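# Minimal, commented-out sketch of urlget; the profile URL is the one from the test block near
# the end of this file:
#   user = urlget("http://tieba.baidu.com//home/main?un=Enven_lei&ie=utf-8&fr=pb&ie=utf-8")
#   print user.sex, user.age, user.fans_num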
'''Collect the profile pages of a user's followees'''
def getconcern(url):
    concern_lst = getfans(url)  # the followee-list page has the same layout, so reuse getfans
    return concern_lst
'''Collect the profile pages of a user's followers'''
def getfans(url):
    prefix = "http://tieba.baidu.com"
    print url
    web_doc = urllib2.urlopen(url).read()
    soup = BeautifulSoup(web_doc, "html.parser", from_encoding="utf-8")
    span = soup.find_all('span', attrs={'class': 'name'})
    p_href = re.compile('/home/main.*')
    home_lst = []
    for s in span:
        homelink = s.find_all('a', attrs={'href': p_href, 'target': '_blank'})[0]['href']
        print homelink
        homelink = prefix + homelink
        home_lst.append(homelink)
    return home_lst
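# Commented-out sketch: feed the follower-list and followee-list links that urlget() stored on a
# user back in, and get the corresponding users' own profile URLs:
#   fans_homepages = getfans(user.get_fans())
#   concern_homepages = getconcern(user.get_concern())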
'''Given a thread URL, return the set of user profile links on its first page, plus the URLs of the thread's remaining pages'''
# Extract the profile links, then read the pager to determine the page count and build the page URLs.
def homeget(url):
    web_doc = urllib2.urlopen(url).read()
    time.sleep(1)
    print "homeget,sleeping..."
    soup = BeautifulSoup(web_doc, "html.parser", from_encoding="utf-8")
    p_home = re.compile(r'/home/main\?.*')  # escape the '?' so it matches literally
    homenode = soup.find_all('a', attrs={'href': p_home, 'target': '_blank', 'class': 'p_author_face '})  # profile-link nodes
    prefix = "http://tieba.baidu.com/"
    linklst = [prefix + home['href'] for home in homenode]  # profile URLs of the posters on this page
    try:
        locate = soup.find_all('li', attrs={'class': 'l_pager pager_theme_5 pb_list_pager'})[0]
    except IndexError:  # no pager element on this page
        print url
        return unique(linklst), []
    alst = locate.find_all('a', attrs={"href": True})
    if alst == []:
        return unique(linklst), []
    else:
        href = alst[len(alst) - 1]['href']
        pagenum = int(href.split('=')[1])  # the last pager link carries the page count
        pagelst = []
        for i in range(2, pagenum + 1):
            page_link = prefix + href.split("=")[0] + "=" + str(i)
            pagelst.append(page_link)
        return unique(linklst), pagelst
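# Worked example of the pager arithmetic above (the href value is hypothetical): if the last
# pager link is '/p/4570812012?pn=7', then pagenum = 7 and the loop builds
#   http://tieba.baidu.com//p/4570812012?pn=2 ... ?pn=7
# (page 1 was already parsed for profile links before the pager was read).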
'''Given a list of page URLs, collect the user profile links from every page and return them as one set'''
def pagesget(page_lst):
    if page_lst == []:
        return set()
    prefix = "http://tieba.baidu.com/"
    totalset = set()
    for page in page_lst:
        web_doc = urllib2.urlopen(page).read()
        time.sleep(1)
        print "pagesget,sleeping..."
        soup = BeautifulSoup(web_doc, "html.parser", from_encoding="utf-8")
        p_home = re.compile(r'/home/main\?.*')
        homenode = soup.find_all('a', attrs={'href': p_home, 'target': '_blank'})  # profile-link nodes; adding 'class':'p_author_face ' would narrow this further
        linklst = [prefix + home['href'] for home in homenode]  # profile URLs of the posters on this page
        totalset = totalset | set(linklst)
    return totalset
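# Commented-out sketch chaining homeget and pagesget to cover one whole thread; the thread URL
# is the one from the test block below:
#   first_page_users, rest_pages = homeget("http://tieba.baidu.com/p/4570812012")
#   all_users = set(first_page_users) | pagesget(rest_pages)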
'''Given a forum, collect the thread links on its first list page'''
def topicenter(url="http://tieba.baidu.com/f?kw=%D6%D0%B9%FA%D2%A9%BF%C6%B4%F3%D1%A7"):
    web_doc = urllib2.urlopen(url).read()
    soup = BeautifulSoup(web_doc, "html.parser", from_encoding="utf-8")
    p = re.compile(r'/p/\d{10}')
    theme_url = soup.find_all('a', attrs={'href': p, 'title': True, 'target': '_blank', 'class': 'j_th_tit'})
    prefix = "http://tieba.baidu.com"
    url_lst = []
    for theme in set(theme_url):  # complete the relative thread links
        theme = prefix + theme['href']
        url_lst.append(theme)
    theme_page = soup.find_all('a', attrs={'class': ' pagination-item '})
    theme_page_links = [theme['href'] for theme in theme_page]
    return url_lst, theme_page_links
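# The default kw value above is the GBK percent-encoding of the forum name u'中国药科大学'
# (China Pharmaceutical University). A sketch of building such a URL for another forum:
#   import urllib
#   kw = urllib.quote(u'中国药科大学'.encode('gbk'))
#   url = "http://tieba.baidu.com/f?kw=" + kw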
'''Collect the thread links from list pages 2 through 10'''
def get_themes(theme_page_links):
    if theme_page_links == []:
        return []  # returning None here would break the set union below
    p = re.compile(r'/p/\d{10}')
    url_lst = []
    prefix = "http://tieba.baidu.com"
    for theme in theme_page_links:
        web_doc = urllib2.urlopen(theme).read()
        print "sleeping......"
        time.sleep(1)
        soup = BeautifulSoup(web_doc, "html.parser", from_encoding="utf-8")
        theme_url = soup.find_all('a', attrs={'href': p, 'title': True, 'target': '_blank', 'class': 'j_th_tit'})
        for t in set(theme_url):  # complete the relative thread links
            url_lst.append(prefix + t['href'])
    return url_lst
#url = "http://tieba.baidu.com//home/main?un=Enven_lei&ie=utf-8&fr=pb&ie=utf-8"
#concern_link,fans_link = urlget(url)
#homelst = getconcern(concern_link)
#url="http://tieba.baidu.com/p/4559199887"
#homelst = homeget(url)
#print homelst
'''主题连接测试'''
#lst,next_page = topicenter()
#print lst
#print next_page
'''抓取页连接并返回当前页内容'''
#url = 'http://tieba.baidu.com/p/4570812012'
#s,p = homeget(url)
#print s
#print p
#pageset = pagesget(p)
#print totalset
themes, pages = topicenter()
otherthemes = get_themes(pages)
themes = set(themes) | set(otherthemes)  # thread links from the first ten list pages
user_url = set()
for pg in themes:
    curruser, pagetalk = homeget(pg)  # profile links on the thread's first page, plus its remaining page URLs
    nextuser = pagesget(pagetalk)  # profile links from the remaining pages
    themeuser = set(curruser) | nextuser  # all profile links in this thread
    user_url = user_url | themeuser  # merge into the overall user set
    print pg
    break  # stop after the first thread (testing); remove this line to crawl every thread
print "the number of active username in baidutieba is: ", len(user_url)  # count the users
for i in user_url:
    user = urlget(i)
    print user.get_fans()
    print user.get_concern()
    print user.url
    print user.username