# Baidu Tieba image spider: follows post links on forum listing pages and saves the images found in each post.
import requests
import re
import urllib.parse
import os
class TieBaSpider:
    """Crawl a Baidu Tieba forum and save the images found in its posts.

    Listing pages are fetched from ``self.url`` (formatted with the forum
    keyword and a paging offset). Every post link found on a listing page is
    followed, and each matching ``<img>`` of the post is downloaded into the
    ``self.name`` directory.
    """

    # Post link on a listing page; group 1 is the site-relative path.
    _POST_LINK_RE = re.compile(r'<a rel="noreferrer" href="(/p.*?)"', re.S)
    # Image tag inside a post; group 1 is the image URL.  The original
    # pattern spelled the class with a space ("BDE Image"); Tieba pages are
    # believed to use "BDE_Image" (TODO confirm), so both spellings match.
    _IMAGE_RE = re.compile(r'<img\sclass="BDE[_ ]Image"\ssrc="(.*?)"')
    # Characters that must not end up in a file name.
    _UNSAFE_NAME_RE = re.compile(r'[\\/:*?"<>|]')
    # Seconds to wait for the server before giving up on a request.
    _TIMEOUT = 10

    def __init__(self):
        """Set the listing URL template and make sure the output directory exists."""
        self.url = 'https://tieba.baidu.com/f?kw={}&pn={}'
        self.name = 'tieba'
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.name, exist_ok=True)

    def send_request(self, full_url):
        """Fetch *full_url* and return the raw response body.

        Returns:
            The body as ``bytes`` when the server answers with HTTP 200;
            ``None`` when the request fails or any other status is returned.
        """
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(full_url, timeout=self._TIMEOUT)
        except requests.RequestException as exc:
            print('出错乱了', exc)
            return None
        if response.status_code == 200:
            return response.content
        print('出错乱了')
        return None

    def parse_content(self, content):
        """Follow every post link found in a listing page.

        Args:
            content: Raw listing-page bytes, or ``None`` if the download
                failed (in which case nothing is done).
        """
        if content is None:
            return
        html = content.decode('utf8', errors='replace')
        for href in self._POST_LINK_RE.findall(html):
            # href is site-relative, so resolve it against the listing URL.
            detail_url = urllib.parse.urljoin(self.url, href)
            self.parse_detail_content(self.send_request(detail_url))

    def parse_detail_content(self, content):
        """Download and save every matching image of a post page.

        Args:
            content: Raw post-page bytes, or ``None`` if the download failed
                (in which case nothing is done).
        """
        if content is None:
            return
        html = content.decode('utf8', errors='replace')
        for img in self._IMAGE_RE.findall(html):
            data = self.send_request(img)
            # Skip images that could not be fetched instead of writing None.
            if data is not None:
                self.save_content(data, img)

    def save_content(self, content, img):
        """Write image bytes into the output directory.

        Args:
            content: Image bytes to write.
            img: Image URL; its last 15 characters become the file name.
        """
        # Keep the original naming scheme but neutralise characters (such as
        # "/") that would otherwise point outside the directory or be
        # rejected by the file system.
        img_name = self._UNSAFE_NAME_RE.sub('_', img[-15:])
        print('正在保存%s' % img_name)
        with open(os.path.join(self.name, img_name), 'wb') as f:
            f.write(content)

    def start(self):
        """Prompt for a forum name and a page count, then crawl those pages."""
        kw = input('输入贴吧的名字')
        page = int(input('请输人爬多少页'))
        # Percent-encode the keyword so characters such as "&" or "#" cannot
        # corrupt the query string.
        quoted_kw = urllib.parse.quote(kw, safe='')
        for i in range(page):
            # The pn query parameter advances in steps of 50 per page.
            pn = i * 50
            full_url = self.url.format(quoted_kw, pn)
            self.parse_content(self.send_request(full_url))
# Run the spider only when this file is executed as a script, not on import.
if __name__ == '__main__':
    tbs = TieBaSpider()
    tbs.start()