# CY3761 | 2021-12-19 12:10
import random
import time
from fake_useragent import UserAgent
from requests import get
from pyquery import PyQuery as pq
import hashlib
import os
# print(os.getcwd()) # 当前执行文件的目录
# 爬取百度贴吧
# 一般只能爬取一页, 第二页会触发 (百度安全验证)
class BaiduTiebaSpider:
    """Crawl a Baidu Tieba forum for a keyword: save thread URLs/titles
    and download every image found on each thread's detail page.

    NOTE(review): Baidu typically blocks the second list page with a
    security check ("百度安全验证"), so in practice only one page may be
    crawlable — confirm before relying on pagination.
    """

    def __init__(self):
        # Keyword (forum name) to crawl.
        # self.kw = input('请输入爬取的关键字:\n')
        self.kw = '周玥'
        # Forum list URL; thread links are built from its prefix.
        self.url = 'https://tieba.baidu.com/f'
        # BUG FIX: the header name must be 'User-Agent' (with a dash).
        # The original 'UserAgent' key is not a real HTTP header, so the
        # default python-requests UA leaked through — the likely reason
        # anti-bot verification kicked in on page 2.
        self.headers = {'User-Agent': UserAgent().random}
        # Current list-page number; also names cached page files.
        self.page = 1
        # Title of the thread currently being processed (used by saveByte).
        self.title = ''
        # Per-keyword data directory, created on first run.
        self.dataDir = 'data/' + self.kw + '/'
        if not os.path.exists(self.dataDir):
            os.makedirs(self.dataDir)

    def send(self, url, params=None):
        """Fetch *url* and return its body.

        params: a query dict, None, or the sentinel True.  True requests
        the raw bytes (``resp.content``) — used for image downloads.
        Text responses are cached on disk as '<id-or-page>.html'.

        Raises Exception on any non-200 (or empty-text) response.
        """
        if params is True:
            # FIX: byte downloads must never touch the text cache.  The
            # old code fell back to naming the cache file after
            # self.page for non-thread URLs, colliding with page HTML
            # files; a cache hit then returned str where bytes were
            # expected and hashlib.md5() crashed.
            time.sleep(random.randint(1, 2))  # throttle to dodge anti-bot detection
            resp = get(url, None, headers=self.headers)
            if resp.status_code == 200:
                return resp.content
            raise Exception('resp-error (%d) (%s)' % (resp.status_code, resp.url))

        # Cache key: the trailing path segment when it is a thread id,
        # otherwise the current page number.
        p = url.split('/')[-1]
        if not p.isdigit():
            p = str(self.page)
        textPath = self.dataDir + p + '.html'
        if not os.path.exists(textPath):
            time.sleep(random.randint(1, 2))  # throttle to dodge anti-bot detection
            resp = get(url, params, headers=self.headers)
            if resp.status_code == 200 and resp.text:
                resp.encoding = 'utf-8'
                with open(textPath, 'w', encoding='utf-8') as w:
                    w.write(resp.text)
                return resp.text
            raise Exception('resp-error (%d) (%s)' % (resp.status_code, resp.url))
        # Cached copy exists — serve it without hitting the network.
        with open(textPath, 'r', encoding='utf-8') as r:
            return r.read()

    def parse(self, text):
        """Parse one forum-list page: record each thread, download its
        images, then recurse into the next page when one exists."""
        print('解析第%d页' % self.page, '响应长度 %d' % len(text), end=' ')
        self.page += 1
        r = pq(text)
        # Thread links look like:
        # <a rel="noreferrer" href="/p/\d+" title="..." target="_blank" class="j_th_tit ">
        aItems = r('a[rel="noreferrer"][target="_blank"][class="j_th_tit "]')  # trailing space in class is deliberate
        aItemsLen = 0
        print('原链接数 %d' % len(aItems))
        base = self.url[:-2]  # strip the trailing '/f'
        for a in aItems:
            item = r(a)
            href = item.attr('href')
            title = item.attr('title')
            u = base + href
            if not u.split('/')[-1].isdigit():
                continue  # not a /p/<id> thread link
            aItemsLen += 1
            self.title = title
            self.saveText(' '.join([u, title]))
            # Fetch the detail page and collect its images (img.BDE_Image).
            imgItems = pq(self.send(u))('img.BDE_Image')
            print('详情: %s 标题: %s 图片数: %d' % (u, title, len(imgItems)))
            for img in imgItems:
                self.saveByte(pq(img).attr('src'))
        print('现链接数 %d' % aItemsLen)
        # Follow the "next page" link if present (renamed from `next`,
        # which shadowed the builtin).
        next_link = r('a[class="next pagination-item "]')
        if next_link:
            self.parse(self.send('https:' + next_link.attr('href')))
        else:
            print('爬虫结束..')

    def saveText(self, text):
        """Append one line of text to the per-keyword '<kw>.txt' log."""
        with open(self.dataDir + self.kw + '.txt', 'a', encoding='utf-8') as w:
            w.write(text + '\n')

    def saveByte(self, url):
        """Download the image at *url* and write it once, de-duplicated
        by MD5 of its content."""
        content = self.send(url, True)
        content_md5 = hashlib.md5(content).hexdigest()
        # NOTE(review): the extension is taken from the last '.'-segment
        # of the URL — a query string would leak into it; confirm image
        # URLs are extension-terminated.
        content_path = self.dataDir + self.title + '_' + content_md5 + '.' + url.split('.')[-1]
        if not os.path.exists(content_path):
            print('下载 %s' % content_path)
            with open(content_path, 'wb') as w:
                w.write(content)
        else:
            print('%s 已存在' % content_path)

    def start(self):
        """Entry point: fetch page 1 of the forum list and start parsing."""
        print('爬虫开始...')
        self.parse(self.send(self.url, dict(kw=self.kw)))
if __name__ == '__main__':  # run only when executed as a script
    # One-liner form: the instance becomes eligible for garbage
    # collection as soon as start() returns, slightly better for memory
    # than holding a named reference (see the alternative below).
    BaiduTiebaSpider().start()
"""
s = BaiduTiebaSpider() # 创建对象 并在内存中开空间
s.start() # 执行完并不会马上销毁
"""
# class -> object-oriented style (classes)
# def   -> procedural style (functions)
# TODO: 希望还能写一个 (hope to write another spider like this one)