# CY3761 | 2021-12-19 12:10
import random
import time
from fake_useragent import UserAgent
from requests import get
from pyquery import PyQuery as pq
import hashlib
import os
# print(os.getcwd()) # 当前执行文件的目录
# 爬取百度贴吧
# 一般只能爬取一页, 第二页会触发 (百度安全验证)
class BaiduTiebaSpider:
    """Crawl a Baidu Tieba forum for a keyword: save thread URLs/titles
    and download every image found on each thread's detail page.

    NOTE(review): Baidu typically blocks the second list page with a
    security check ("百度安全验证"), so in practice only one page may be
    crawlable — confirm before relying on pagination.
    """

    def __init__(self):
        # Keyword (forum name) to crawl.
        # self.kw = input('请输入爬取的关键字:\n')
        self.kw = '周玥'
        # Forum list URL; thread links are built from its prefix.
        self.url = 'https://tieba.baidu.com/f'
        # BUG FIX: the header name must be 'User-Agent' (with a dash).
        # The original 'UserAgent' key is not a real HTTP header, so the
        # default python-requests UA leaked through — the likely reason
        # anti-bot verification kicked in on page 2.
        self.headers = {'User-Agent': UserAgent().random}
        # Current list-page number; also names cached page files.
        self.page = 1
        # Title of the thread currently being processed (used by saveByte).
        self.title = ''
        # Per-keyword data directory, created on first run.
        self.dataDir = 'data/' + self.kw + '/'
        if not os.path.exists(self.dataDir):
            os.makedirs(self.dataDir)

    def send(self, url, params=None):
        """Fetch *url* and return its body.

        params: a query dict, None, or the sentinel True.  True requests
        the raw bytes (``resp.content``) — used for image downloads.
        Text responses are cached on disk as '<id-or-page>.html'.

        Raises Exception on any non-200 (or empty-text) response.
        """
        if params is True:
            # FIX: byte downloads must never touch the text cache.  The
            # old code fell back to naming the cache file after
            # self.page for non-thread URLs, colliding with page HTML
            # files; a cache hit then returned str where bytes were
            # expected and hashlib.md5() crashed.
            time.sleep(random.randint(1, 2))  # throttle to dodge anti-bot detection
            resp = get(url, None, headers=self.headers)
            if resp.status_code == 200:
                return resp.content
            raise Exception('resp-error (%d) (%s)' % (resp.status_code, resp.url))

        # Cache key: the trailing path segment when it is a thread id,
        # otherwise the current page number.
        p = url.split('/')[-1]
        if not p.isdigit():
            p = str(self.page)
        textPath = self.dataDir + p + '.html'
        if not os.path.exists(textPath):
            time.sleep(random.randint(1, 2))  # throttle to dodge anti-bot detection
            resp = get(url, params, headers=self.headers)
            if resp.status_code == 200 and resp.text:
                resp.encoding = 'utf-8'
                with open(textPath, 'w', encoding='utf-8') as w:
                    w.write(resp.text)
                return resp.text
            raise Exception('resp-error (%d) (%s)' % (resp.status_code, resp.url))
        # Cached copy exists — serve it without hitting the network.
        with open(textPath, 'r', encoding='utf-8') as r:
            return r.read()

    def parse(self, text):
        """Parse one forum-list page: record each thread, download its
        images, then recurse into the next page when one exists."""
        print('解析第%d页' % self.page, '响应长度 %d' % len(text), end=' ')
        self.page += 1
        r = pq(text)
        # Thread links look like:
        # <a rel="noreferrer" href="/p/\d+" title="..." target="_blank" class="j_th_tit ">
        aItems = r('a[rel="noreferrer"][target="_blank"][class="j_th_tit "]')  # trailing space in class is deliberate
        aItemsLen = 0
        print('原链接数 %d' % len(aItems))
        base = self.url[:-2]  # strip the trailing '/f'
        for a in aItems:
            item = r(a)
            href = item.attr('href')
            title = item.attr('title')
            u = base + href
            if not u.split('/')[-1].isdigit():
                continue  # not a /p/<id> thread link
            aItemsLen += 1
            self.title = title
            self.saveText(' '.join([u, title]))
            # Fetch the detail page and collect its images (img.BDE_Image).
            imgItems = pq(self.send(u))('img.BDE_Image')
            print('详情: %s 标题: %s 图片数: %d' % (u, title, len(imgItems)))
            for img in imgItems:
                self.saveByte(pq(img).attr('src'))
        print('现链接数 %d' % aItemsLen)
        # Follow the "next page" link if present (renamed from `next`,
        # which shadowed the builtin).
        next_link = r('a[class="next pagination-item "]')
        if next_link:
            self.parse(self.send('https:' + next_link.attr('href')))
        else:
            print('爬虫结束..')

    def saveText(self, text):
        """Append one line of text to the per-keyword '<kw>.txt' log."""
        with open(self.dataDir + self.kw + '.txt', 'a', encoding='utf-8') as w:
            w.write(text + '\n')

    def saveByte(self, url):
        """Download the image at *url* and write it once, de-duplicated
        by MD5 of its content."""
        content = self.send(url, True)
        content_md5 = hashlib.md5(content).hexdigest()
        # NOTE(review): the extension is taken from the last '.'-segment
        # of the URL — a query string would leak into it; confirm image
        # URLs are extension-terminated.
        content_path = self.dataDir + self.title + '_' + content_md5 + '.' + url.split('.')[-1]
        if not os.path.exists(content_path):
            print('下载 %s' % content_path)
            with open(content_path, 'wb') as w:
                w.write(content)
        else:
            print('%s 已存在' % content_path)

    def start(self):
        """Entry point: fetch page 1 of the forum list and start parsing."""
        print('爬虫开始...')
        self.parse(self.send(self.url, dict(kw=self.kw)))
if __name__ == '__main__':  # run only when executed as a script
    # One-liner form: the instance becomes eligible for garbage
    # collection as soon as start() returns, slightly better for memory
    # than holding a named reference (see the alternative below).
    BaiduTiebaSpider().start()
"""
s = BaiduTiebaSpider() # 创建对象 并在内存中开空间
s.start() # 执行完并不会马上销毁
"""
# class -> object-oriented style (classes)
# def   -> procedural style (functions)
# TODO: 希望还能写一个 (hope to write another spider like this one)