# The old approach was developed and ran fine on Windows, but broke when run on
# a MacBook Pro because of the classic path-separator problem, so it was rewritten.
# Practice code exercising the os, re, requests, random and pathlib modules.
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
# This crawler downloads every <img> with class="BDE_Image" from the page
# https://tieba.baidu.com/p/6592791259.
import os
import re
import requests
import random
from pathlib import Path
# Fetch the content of the given page.
def getHTMLText(url):
    """Fetch the page at *url* and return its decoded HTML text.

    Returns the sentinel string ``'error'`` on any request failure
    (kept for backward compatibility with existing callers).
    """
    # Pool of User-Agent headers; one is chosen at random per request
    # so the crawler looks less like an automated client.
    headers = [{
        "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"
    }, {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
    }]
    head = random.choice(headers)  # pick one header set at random
    try:
        r = requests.get(url, timeout=50, headers=head)
        r.raise_for_status()  # raise HTTPError for 4xx/5xx responses
        r.encoding = r.apparent_encoding  # guess encoding from the body
        return r.text
    except requests.RequestException:
        # Was a bare `except:` which also swallowed KeyboardInterrupt /
        # SystemExit; only network/HTTP errors should map to 'error'.
        return 'error'
# Parse out and save the images of the specified class.
# Legacy implementation using the os module: it ran fine on Windows but hit
# path-separator problems on macOS/Linux. os.path.join could patch it over,
# but since this project targets Python 3.8 the rewrite below uses the newer
# pathlib module and its features instead.
def saveImages_old(info):
    """Legacy image saver (os.path based).

    Scans *info* for ``<img class="BDE_Image" src="...jpg">`` URLs and
    downloads each one into ./images as 1.jpg, 2.jpg, ...

    Originally this built paths with hard-coded ``'\\'`` separators, so the
    existence check (``path + r'\\images'``) and the mkdir call
    (``os.mkdir('images')``, relative) disagreed and the code only worked on
    Windows. Paths are now built with os.path.join, which is portable.
    """
    path = os.path.abspath('.')  # absolute current dir; same as os.getcwd()
    images_dir = os.path.join(path, 'images')
    if not os.path.exists(images_dir):  # create ./images on first run
        os.mkdir(images_dir)
    comp = re.compile(r'<img class="BDE_Image" src="(.+?\.jpg)"')
    get_lists = re.findall(comp, info)
    name = 1
    for url in get_lists:
        print('正在下载第:' + str(name) + '张图片')
        r = requests.get(url)
        with open(os.path.join(images_dir, '%s.jpg' % name), 'wb') as f:
            f.write(r.content)
        name += 1
# Parse out and save the images of the specified class.
# Rewritten with the pathlib module (added in Python 3.4): a single code path
# handles Windows, macOS and Linux path formats, and string formatting uses
# the f-strings introduced in Python 3.6.
def saveImages_new(info):
    """Download all ``class="BDE_Image"`` .jpg images found in *info*.

    Uses pathlib so the same code runs on Windows, macOS and Linux; files
    are written into ./images as 1.jpg, 2.jpg, ...
    """
    imagepath = Path.cwd() / 'images'
    # exist_ok=True already tolerates an existing directory, so the previous
    # `imagepath.exists() == False` pre-check was redundant and is dropped.
    imagepath.mkdir(exist_ok=True)
    comp = re.compile(r'<img class="BDE_Image" src="(.+?\.jpg)"')
    # enumerate replaces the hand-maintained `name = 1; name += 1` counter.
    for name, url in enumerate(re.findall(comp, info), start=1):
        print('正在下载第:' + str(name) + '张图片')
        r = requests.get(url)
        (imagepath / f'{name}.jpg').write_bytes(r.content)
if __name__ == "__main__":
    # Fetch the thread page, then download its images. getHTMLText returns
    # the sentinel string 'error' on failure, so skip saving in that case
    # instead of scanning the sentinel for image URLs.
    result = getHTMLText('https://tieba.baidu.com/p/6592791259')
    if result != 'error':
        saveImages_new(result)
    else:
        print('页面抓取失败')