Preface
I previously shared crawler code for Weibo. Big sites like these have fairly strict anti-crawling mechanisms, and it's entirely normal for the code to stop working at any moment, so you'll have to tweak the details yourselves. This time I'm sharing a crawler I wrote that scrapes the contents of Baidu Tieba posts.
The code
import requests
from bs4 import BeautifulSoup
import re
import pymysql
import time

# Mainland mobile numbers: 11 digits starting with 1[345789], with guards
# so the match is not embedded in a longer run of digits.
phone_pat = re.compile(r'([^\d]|^)(1[345789]\d{9})([^\d]|$)')
# QQ numbers: 5-10 digits not starting with 0, same digit guards.
qq_pat = re.compile(r'([^\d]|^)([1-9]\d{4,9})([^\d]|$)')
# WeChat IDs: a trigger word ("微信", "vx", "加", ...) followed by a
# 6-20 character ID starting with a letter. Deliberately loose, so noisy.
weixin_pat = re.compile(r'(微信|微|卫星|Vx|VX|vX|vx|xv|XV|V|v|联系我|加)[^a-zA-Z0-9]*?([a-zA-Z][a-zA-Z0-9_-]{5,19})')
# Stricter standalone variant, kept for reference:
# weixin_pat = re.compile(r'([^a-zA-Z0-9]|^)([a-zA-Z][a-zA-Z0-9_-]{5,19})([^a-zA-Z0-9]|$)')
# Sub-comments quote replies as "回复 xxx:actual text"; capture only the
# text after the colon.
lzl_pat = re.compile(r'回复.*:([^:]+)')
def sql_commit(sql):
    # Execute one INSERT and commit; roll back silently on failure.
    try:
        cursor.execute(sql)
        con.commit()
    except Exception as e:
        con.rollback()
        # print(e, sql)
def get_phone(text):
    # insert_url is a global set in the main loop before these helpers run.
    res = phone_pat.search(text)
    if res:
        r = res.group(2)
        # The matched value is digits only and the URL contains no quotes,
        # so string formatting gets by here; parameterized queries would
        # still be the safer habit.
        insert_sql1 = "insert into phone_list(`phone_no`,`come_from`,`url`) values ('{}','戒赌吧','{}')".format(r, insert_url)
        sql_commit(insert_sql1)

def get_qq(text):
    res = qq_pat.search(text)
    if res:
        r = res.group(2)
        insert_sql2 = "insert into qq_list(`qq`,`come_from`,`url`) values ('{}','戒赌吧','{}')".format(r, insert_url)
        sql_commit(insert_sql2)

def get_weixin(text):
    res = weixin_pat.search(text)
    if res:
        r = res.group(2)
        insert_sql3 = "insert into weixin_list(`weixin`,`come_from`,`url`) values ('{}','戒赌吧','{}')".format(r, insert_url)
        sql_commit(insert_sql3)
def get_pagenum(obj):
    # Read the thread's reply-page count from the "l_reply_num" widget;
    # return 0 if the element is missing or has an unexpected layout.
    try:
        page_obj = obj.find('li', class_='l_reply_num')
        page_max = page_obj.contents[2].get_text()
    except Exception:
        page_max = 0
    return int(page_max)
def get_huifu(obj):
    # Every floor's body text on a thread page; the trailing space in the
    # class attribute is part of Tieba's markup and must be kept.
    huifu_list = obj.find_all('div', class_="d_post_content j_d_post_content ")
    for h in huifu_list:
        h_text = h.get_text()
        get_phone(h_text)
        get_qq(h_text)
        get_weixin(h_text)
def retry_visit(web):
    # GET with up to 10 retries, 10 seconds apart. If every attempt fails,
    # `response` is never bound and the function ends the run with a
    # NameError, as the message below warns.
    retry_flag = 0
    success_flag = 0
    while success_flag == 0 and retry_flag < 10:
        try:
            response = s.get(web)
            success_flag = 1
        except Exception:
            retry_flag += 1
            print('Retrying, attempt %d' % retry_flag)
            time.sleep(10)
    if retry_flag == 10:
        print('Access has been blocked; the program will exit with an error')
    return response
def get_lzl(url):
    # The totalComment endpoint returns every floor's nested sub-comments
    # ("楼中楼") for the page as JSON.
    lzl = retry_visit(url)
    lzl = lzl.json()
    lzl2 = lzl['data']['comment_list']
    for l in lzl2:
        for ll in lzl2[l]['comment_info']:
            c = ll['content']
            # Strip the "回复 xxx:" prefix from quoted replies.
            res_lzl = lzl_pat.search(c)
            if res_lzl:
                lzl_text = res_lzl.group(1)
            else:
                lzl_text = c
            get_phone(lzl_text)
            get_qq(lzl_text)
            get_weixin(lzl_text)
if __name__ == '__main__':
    ua = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36'
    headers = {'User-Agent': ua}
    s = requests.Session()
    s.headers.update(headers)
    con = pymysql.connect(host="localhost", user="root", password="root", db="blacklist", charset="utf8mb4", port=3306)
    cursor = con.cursor()
    tieba_url = 'https://tieba.baidu.com'
    fid = '615140'  # forum id of the target bar, needed by the totalComment endpoint
    page = 0  # Tieba's pn parameter steps by 50 threads per listing page
    while page < 100000:
        # kw=%E6%88%92%E8%B5%8C is the URL-encoded forum name 戒赌
        url = 'https://tieba.baidu.com/f?kw=%E6%88%92%E8%B5%8C&ie=utf-8&pn=' + str(page)
        html = retry_visit(url)
        # html.encoding = 'utf8mb4'
        obj = BeautifulSoup(html.content, 'html.parser')
        # Tieba ships the thread list inside an HTML comment in a <code>
        # tag, so its string content has to be parsed a second time.
        tiezi_obj = obj.find('code', id="pagelet_html_frs-list/pagelet/thread_list")
        new_obj = BeautifulSoup(tiezi_obj.string, 'lxml')
        tiezi_list = new_obj.find_all('div', class_='threadlist_title pull_left j_th_tit ')
        for t in tiezi_list:
            tiezi_id = t.a.get('href')  # e.g. '/p/1234567890'
            tid = tiezi_id[3:]  # strip the leading '/p/' to get the thread id
            tiezi_url = tieba_url + tiezi_id
            insert_url = tiezi_url
            tie_html = retry_visit(tiezi_url)
            tie_obj = BeautifulSoup(tie_html.content, 'lxml')
            get_huifu(tie_obj)
            lzl_xhr = 'https://tieba.baidu.com/p/totalComment?tid={}&fid={}'.format(tid, fid)
            get_lzl(lzl_xhr)
            # Walk the remaining reply pages of the thread, if any.
            page_num = get_pagenum(tie_obj)
            if page_num > 1:
                for p in range(2, page_num + 1):
                    tiezi_nextpage_url = ''.join([tiezi_url, '?pn=', str(p)])
                    insert_url = tiezi_nextpage_url
                    tiezi_nextpage_html = retry_visit(tiezi_nextpage_url)
                    tiezi_nextpage_obj = BeautifulSoup(tiezi_nextpage_html.content, 'lxml')
                    get_huifu(tiezi_nextpage_obj)
                    lzl_nextpage_url = 'https://tieba.baidu.com/p/totalComment?tid={}&fid={}&pn={}'.format(tid, fid, str(p))
                    get_lzl(lzl_nextpage_url)
        page += 50
        if page % 500 == 0:
            print('Crawled %d listing pages, pn is now %d' % (page // 50, page))
    cursor.close()
    con.close()
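
For anyone who wants to run this: the script expects a local blacklist database with three tables already in place. Below is a minimal setup sketch matching the column names used in the INSERT statements above; the column types and lengths are my own assumptions, adjust to taste.

import pymysql

# Hypothetical schema matching the INSERTs above; types/lengths are guesses.
DDL = """
create table if not exists {table} (
    id int auto_increment primary key,
    {col} varchar(32) not null,
    come_from varchar(64),
    url varchar(255)
) default charset=utf8mb4
"""

con = pymysql.connect(host="localhost", user="root", password="root",
                      db="blacklist", charset="utf8mb4", port=3306)
with con.cursor() as cursor:
    for table, col in [('phone_list', 'phone_no'),
                       ('qq_list', 'qq'),
                       ('weixin_list', 'weixin')]:
        cursor.execute(DDL.format(table=table, col=col))
con.commit()
con.close()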
Afterword
What I needed were the QQ numbers, phone numbers, and WeChat IDs that come up in the 戒赌吧 forum, so I use three separate regexes to extract them. The phone-number format is by far the most clearly defined, so those hits are the most reliable; QQ numbers come second; the WeChat pattern picks up all kinds of junk, so treat its results as a rough reference only.
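
A quick standalone check makes the precision gap visible. The patterns are the same ones defined in the script; the sample strings are made up:

import re

phone_pat = re.compile(r'([^\d]|^)(1[345789]\d{9})([^\d]|$)')
qq_pat = re.compile(r'([^\d]|^)([1-9]\d{4,9})([^\d]|$)')
weixin_pat = re.compile(r'(微信|微|卫星|Vx|VX|vX|vx|xv|XV|V|v|联系我|加)[^a-zA-Z0-9]*?([a-zA-Z][a-zA-Z0-9_-]{5,19})')

samples = [
    '上岸了,电话13812345678',    # 11-digit mobile format is unambiguous
    '有意加QQ 123456789 详谈',    # any standalone 5-10 digit run looks like a QQ
    '加微信 abc_12345 带你回血',  # a WeChat ID needs a trigger word before it
]
for text in samples:
    for name, pat in [('phone', phone_pat), ('qq', qq_pat), ('weixin', weixin_pat)]:
        m = pat.search(text)
        if m:
            print(name, '->', m.group(2))
# Note the third sample: qq_pat also fires on the digit tail '12345' of the
# WeChat ID, which is exactly why the QQ hits are less trustworthy.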
The crawler doesn't just scrape the reply posts; it also pulls everything inside the nested sub-comments (楼中楼), and it genuinely took some effort to read through Tieba's page structure to get there. Worth mentioning: Tieba blocks access beyond a certain page number, though logging in to an account seems to lift the limit. Either way, some limit definitely exists. Amusingly, if you click straight to the last page it will most likely hang, and if it doesn't hang, a four-character Chinese CAPTCHA pops up; even a correct answer doesn't help, the CAPTCHA just loops forever. In short, it simply won't let you through.
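
For reference, get_lzl walks the totalComment JSON in the shape below. The nesting (data, then comment_list keyed by floor id, then comment_info with a content field) is read straight off the code; the concrete ids and texts here are invented for illustration:

import re

# Shape inferred from get_lzl; ids and texts are made up.
fake_response = {
    'data': {
        'comment_list': {
            '98765432': {                    # keyed by the floor's post id
                'comment_info': [
                    {'content': '回复 某某:加我微信 abc_12345'},
                    {'content': '已上岸,别赌了'},
                ],
            },
        },
    },
}

lzl_pat = re.compile(r'回复.*:([^:]+)')
for post_id, floor in fake_response['data']['comment_list'].items():
    for comment in floor['comment_info']:
        c = comment['content']
        m = lzl_pat.search(c)
        # Quoted replies carry a "回复 xxx:" prefix; keep only the body text.
        print(m.group(1) if m else c)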
During the crawl I also lived through one overhaul of Tieba's page HTML, which forced me to rewrite the crawler once (the pain of scraping these big sites), so by the time you read this post the code may well have stopped working again, heh. Good luck.