Sharing Python Code for a Baidu Tieba Crawler

A Few Words Up Front

I previously shared crawler code for Weibo. The big sites all have fairly strict anti-crawling mechanisms, so it is entirely normal for this kind of code to stop working at any moment, and you will have to adjust the details yourself. This time I am sharing a crawler I wrote that scrapes post content from Baidu Tieba.

The Code

import requests
from bs4 import BeautifulSoup
import re
import pymysql
import time

phone_pat = re.compile(r'([^\d]|^)(1[345789]\d{9})([^\d]|$)')
qq_pat = re.compile(r'([^\d]|^)([1-9]\d{4,9})([^\d]|$)')
weixin_pat = re.compile(r'(微信|微|卫星|Vx|VX|vX|vx|xv|XV|V|v|联系我|加)[^a-zA-Z0-9]*?([a-zA-Z][a-zA-Z0-9_-]{5,19})')
#weixin_pat = re.compile(r'([^a-zA-Z0-9]|^)([a-zA-Z][a-zA-Z0-9_-]{5,19})([^a-zA-Z0-9]|$)')

lzl_pat = re.compile(r'回复.*:([^:]+)')

def sql_commit(sql):
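    # Run one INSERT and commit it; on any failure (duplicate rows, odd characters, ...) roll back and keep going.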
    try:
        cursor.execute(sql)
        con.commit()
    except Exception as e:
        con.rollback()
        # print(e,sql)

def get_phone(text):
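    # Pull a mainland mobile number out of the text and store it together with the source URL (the global insert_url).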
    res = phone_pat.search(text)
    if res:
        r = res.group(2)
        insert_sql1 = "insert into phone_list(`phone_no`,`come_from`,`url`) values ('{}','戒赌吧','{}')".format(r,insert_url)
        sql_commit(insert_sql1)

def get_qq(text):
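    # Pull a 5-10 digit QQ number out of the text and store it with the source URL.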
    res = qq_pat.search(text)
    if res:
        r = res.group(2)
        insert_sql2 = "insert into qq_list(`qq`,`come_from`,`url`) values ('{}','戒赌吧','{}')".format(r,insert_url)
        sql_commit(insert_sql2)

def get_weixin(text):
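    # Pull a WeChat-style ID (a letter followed by 5-19 letters/digits/underscores/hyphens) that appears after a trigger word such as 微信 / V / 加.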
    res = weixin_pat.search(text)
    if res:
        r = res.group(2)
        insert_sql3 = "insert into weixin_list(`weixin`,`come_from`,`url`) values ('{}','戒赌吧','{}')".format(r,insert_url)
        sql_commit(insert_sql3)

def get_pagenum(obj):
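    # Read the thread's total page count from the 'l_reply_num' element; fall back to 0 if it is missing.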
    try:
        page_obj = obj.find('li',class_ = 'l_reply_num')
        page_max = page_obj.contents[2].get_text()
    except:
        page_max = 0
    return int(page_max)

def get_huifu(obj):
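    # Scan every reply (floor) on the current thread page for phone / QQ / WeChat information.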
    huifu_list = obj.find_all('div', class_="d_post_content j_d_post_content ")
    for h in huifu_list:
        h_text = h.get_text()
        get_phone(h_text)
        get_qq(h_text)
        get_weixin(h_text)

def retry_visit(web):
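    # GET the URL with up to 10 retries; the 10-second sleep runs after every attempt, successful or not, which also throttles the crawl.
    # If all 10 attempts fail, 'response' is never assigned and the return below raises, stopping the program.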
    retry_flag = 0
    success_flag = 0
    while success_flag == 0 and retry_flag < 10:
        try:
            response = s.get(web)
            success_flag = 1
        except:
            retry_flag += 1
            print('正在重试第%d次' % retry_flag)
        time.sleep(10)
    if retry_flag == 10:
        print('访问已被禁止,程序将报错退出')
    return response

def get_lzl(url):
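    # Fetch the totalComment JSON and run the same extraction on every 楼中楼 (sub-floor) comment,
    # first stripping the leading '回复 ... :' part so only the reply body is scanned.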
    lzl = retry_visit(url)
    lzl = lzl.json()
    lzl2 = lzl['data']['comment_list']
    for l in lzl2:
        for ll in lzl2[l]['comment_info']:
            c = ll['content']
            res_lzl = lzl_pat.search(c)
            if res_lzl:
                lzl_text = res_lzl.group(1)
            else:
                lzl_text = c
            get_phone(lzl_text)
            get_qq(lzl_text)
            get_weixin(lzl_text)

if __name__ == '__main__':
    ua = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36'
    headers = {'User-Agent': ua}
    s = requests.Session()
    s.headers.update(headers)

    con = pymysql.connect(host="localhost", user="root", password="root", db="blacklist", charset="utf8mb4", port=3306)
    cursor = con.cursor()

    tieba_url = 'https://tieba.baidu.com'
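    # fid is the forum id that the totalComment endpoint expects for this bar.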
    fid = '615140'
    page = 0
    while page < 100000:
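        # pn is the offset into the forum's thread list; it advances by 50 per list page (see page += 50 at the end of the loop).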
        url = 'https://tieba.baidu.com/f?kw=%E6%88%92%E8%B5%8C&ie=utf-8&pn=' + str(page)
        html = retry_visit(url)
        #html.encoding = 'utf8mb4'
        obj = BeautifulSoup(html.content,'html.parser')
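        # The thread list is shipped inside a <code> placeholder tag, so its contents have to be parsed a second time.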
        tiezi_obj = obj.find('code',id="pagelet_html_frs-list/pagelet/thread_list")
        new_obj = BeautifulSoup(tiezi_obj.string,'lxml')
        tiezi_list = new_obj.find_all('div',class_ = 'threadlist_title pull_left j_th_tit ')
        for t in tiezi_list:
            tiezi_id = t.a.get('href')
            tid = tiezi_id[3:]
            tiezi_url = tieba_url + tiezi_id
            insert_url = tiezi_url
            tie_html = retry_visit(tiezi_url)
            tie_obj = BeautifulSoup(tie_html.content,'lxml')
            get_huifu(tie_obj)
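            # totalComment is the XHR endpoint that returns this page's 楼中楼 comments as JSON.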

            lzl_xhr = 'https://tieba.baidu.com/p/totalComment?tid={}&fid={}'.format(tid,fid)
            get_lzl(lzl_xhr)
            page_num = get_pagenum(tie_obj)
            if page_num > 1:
                for p in range(2,page_num + 1):
                    tiezi_nextpage_url = ''.join([tiezi_url,'?pn=',str(p)])
                    insert_url = tiezi_nextpage_url
                    tiezi_nextpage_html = retry_visit(tiezi_nextpage_url)
                    tiezi_nextpage_obj = BeautifulSoup(tiezi_nextpage_html.content,'lxml')
                    get_huifu(tiezi_nextpage_obj)

                    lzl_nextpage_url = 'https://tieba.baidu.com/p/totalComment?tid={}&fid={}&pn={}'.format(tid,fid,str(p))
                    get_lzl(lzl_nextpage_url)
        page += 50
        if page % 500 == 0:
            print('已抓取到%d页,pn码%d' % (page//50+1,page))
    cursor.close()
    con.close()

Afterword

What I needed were the QQ numbers, phone numbers and WeChat IDs mentioned in 戒赌吧 (the quit-gambling bar), so three separate regexes extract them. The phone-number rule is the most clear-cut and therefore the most trustworthy, QQ numbers come second, and the WeChat regex picks up all sorts of junk, so treat those results as a rough reference only.
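
To make the difference concrete, here is a minimal, self-contained sketch that runs the same three patterns over a few made-up strings (every number and ID below is hypothetical, purely for illustration):

import re

phone_pat = re.compile(r'([^\d]|^)(1[345789]\d{9})([^\d]|$)')
qq_pat = re.compile(r'([^\d]|^)([1-9]\d{4,9})([^\d]|$)')
weixin_pat = re.compile(r'(微信|微|卫星|Vx|VX|vX|vx|xv|XV|V|v|联系我|加)[^a-zA-Z0-9]*?([a-zA-Z][a-zA-Z0-9_-]{5,19})')

samples = [
    '上岸交流，电话13812345678',   # hypothetical phone number
    '加我扣扣 123456789 聊',       # hypothetical QQ number
    '想翻身的加微信 haofan_88',    # hypothetical WeChat ID
]
for text in samples:
    for name, pat in (('phone', phone_pat), ('qq', qq_pat), ('weixin', weixin_pat)):
        m = pat.search(text)
        if m:
            # group(2) is the captured number / ID, exactly as in the crawler above
            print(name, '->', m.group(2))

The phone pattern only fires on a full 11-digit mobile number, while the QQ and WeChat patterns accept much looser shapes, which is exactly why their results need more manual checking.
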
The crawler does not just scrape the reply posts; it also collects everything inside the 楼中楼 (nested sub-comments), which took some real effort spent reading through Tieba's page structure. It is worth mentioning that Tieba blocks access beyond a certain number of pages; logging in to an account seems to lift that limit, but one way or another some restriction definitely exists. Amusingly, if you jump straight to the last page it will most likely just hang, and if it does not, a pick-four-Chinese-characters CAPTCHA pops up; even answering it correctly does not help, because the CAPTCHA keeps reappearing in an endless loop. In short, it simply will not let you through.
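
For reference, the 楼中楼 data comes from the totalComment endpoint that get_lzl parses. Judging purely from how that function indexes the response, the JSON is shaped roughly like the hand-written stand-in below (only the fields the code actually reads are shown, and every key and value here is invented):

fake_response = {
    'data': {
        'comment_list': {
            '123456789': {                                    # hypothetical post (floor) id
                'comment_info': [
                    {'content': '回复 某某 :加V haofan_88'},   # hypothetical sub-comment text
                ],
            },
        },
    },
}

comment_list = fake_response['data']['comment_list']
for post_id in comment_list:
    for comment in comment_list[post_id]['comment_info']:
        print(post_id, comment['content'])

The real endpoint of course returns far more fields; this is just the skeleton the extraction relies on.
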
During the crawl I also lived through one redesign of Tieba's page HTML, which forced me to rewrite part of the code (the pain of crawling these big sites), so by the time you read this post it may well no longer work, haha. Good luck.
