Python scraper for the 小草 site: download the desired images to a local folder based on keywords, then organize and name them by keyword.
- First crawl every post title (and its link) under the target board, then filter the titles by the desired keywords.
- For each title that passes the filter, follow its link and download the images on that page. (A sketch of the assumed imports and class skeleton follows below.)
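The methods below are shown on their own, so here is a minimal sketch of the imports, class skeleton, and driver variables they rely on. The class name ImgSpyder, the folder paths, and basic_url are assumptions inferred from the driver loop further down, not values from the original project.

```python
import os
import time
import shutil
import requests
from bs4 import BeautifulSoup

class ImgSpyder:
    """Hypothetical container for the methods shown below."""
    def __init__(self, src, newimg_src):
        self.src = src                # root folder for downloaded images (assumed)
        self.newimg_src = newimg_src  # folder receiving copies of newly downloaded images (assumed)
        self.info_list = []           # local paths of images downloaded in the current run

    # download_img / getHTMLText / parseHTML / formatHTMLInfo / bk_newImg are defined below

# Driver variables assumed by the loop further down:
Spyder = ImgSpyder(src='D:\\imgs\\', newimg_src='D:\\imgs_new\\')  # hypothetical paths
basic_url = 'https://example.com/board.php?page='                  # hypothetical list-page URL
```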
def download_img(self, img_url, src, fileName):
    # Download one image to src + fileName, skipping files that already exist locally.
    try:
        if not os.path.exists(src):
            os.makedirs(src)
        if not os.path.exists(src + fileName):
            header = {'user-agent': 'Mozilla/5.0'}
            r = requests.get(img_url, headers=header, stream=True)
            if r.status_code == 200:
                with open(src + fileName, 'wb') as f:
                    f.write(r.content)
                print(src + fileName + " downloaded successfully")
                self.info_list.append(src + fileName)  # remember the new file for the backup step
            del r
    except Exception as e:
        print("Download error: " + str(e))
        print("url:" + img_url + ", file:" + src + fileName)
def getHTMLText(self, url):
    # Fetch a page and return its text, or an empty string on failure.
    try:
        kv = {'user-agent': 'Mozilla/5.0'}
        r = requests.get(url, timeout=30, headers=kv)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception as e:
        print(url + ' page fetch error: ' + str(e))
        return ''
def parseHTML(self, html):
    # Parse a list page into [title, link] pairs, keeping only picture posts.
    ls = []
    try:
        soup = BeautifulSoup(html, 'html.parser')
        alltr = soup.find_all('tr', 'tr3 t_one tac')
        for tr in alltr:
            h3 = tr.find('h3')
            pageUrl = h3.find('a').get('href')
            name = h3.string.strip()
            if "P]" in name:  # titles carrying a picture-count tag such as "[15P]"
                ls.append([name, pageUrl])
    except Exception as e:
        print('Page parsing error: ' + str(e))
    return ls  # always return a list so callers can iterate safely
def formatHTMLInfo(self, html, src):
    # Parse a post page and download every image in its content block into src.
    try:
        soup = BeautifulSoup(html, 'html.parser')
        div = soup.find('div', 'tpc_content do_not_catch')
        imgs = div.find_all('img')
        if not os.path.exists(src):
            os.makedirs(src)
        i = 1
        for img in imgs:
            # the 'ess-data' attribute carries the real image URL; keep its file extension
            tail = os.path.splitext(img.get('ess-data'))[-1]
            idx = str(i).zfill(3)  # 001, 002, ... so files sort in download order
            self.download_img(img.get('ess-data'), src + '\\', idx + tail)
            i += 1
    except Exception as e:
        print('Page parsing error: ' + str(e))
depth = 100
keywords = {}  # category name -> keywords matched against post titles
keywords['颜色'] = ['红', '橙', '黄', '绿']
keywords['学生'] = ['小学', '初中', '高中', '大学', '学生']
for i in range(depth):
    url = basic_url + str(i + 1)
    html = Spyder.getHTMLText(url)
    temp_list = Spyder.parseHTML(html)
    prefix = 'https://分享你我光圈下的最美'
    for page in temp_list:
        for key in keywords:
            vList = keywords[key]
            for v in vList:
                if v in page[0]:
                    flag = 2
                    while flag > 0:
                        InfoUrl = prefix + page[1]
                        print('Page: ' + str(i + 1) + ', category: ' + v + ', ' + page[0] + InfoUrl)
                        htmlInfo = Spyder.getHTMLText(InfoUrl)
                        # images land under <src>/<category>/<post title>/
                        Spyder.formatHTMLInfo(htmlInfo, Spyder.src + key + '\\' + page[0])
                        time.sleep(3)  # be polite between requests
                        flag -= 1
- Save the fetched page source to a local file
def writeHTML(path, txt):
    with open(path, 'w+', encoding='utf-8') as f:
        f.write(txt)
- Read a previously saved page back as text [one write plus one read makes it easy to study the target pages before scraping them; a usage sketch follows the function below]
def readHTML(path):
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()
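A possible way to combine the two helpers when inspecting page structure offline; the file path and page index are only illustrative, and basic_url / Spyder follow the skeleton assumed earlier.

```python
# Hypothetical: save the first list page once, then analyze it offline as often as needed.
url = basic_url + '1'
writeHTML('D:\\imgs\\page1.html', Spyder.getHTMLText(url))

html = readHTML('D:\\imgs\\page1.html')
for name, pageUrl in Spyder.parseHTML(html):
    print(name, pageUrl)  # check which titles and links the parser picks up
```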
- Collect the paths of newly downloaded files in a list, and automatically copy each of those files to a target path.
- For example, when crawling once a day: everything fetched on day one is stored under a; on day two, files already present locally are not downloaded again, but a new post b has appeared.
- So b is downloaded locally first, and its local path is appended to info_list.
- When the batch run finishes, every file in info_list is copied under the to_url path. (A usage sketch follows the function below.)
def bk_newImg(self):
    # Copy every image downloaded in this run to the backup folder,
    # preserving its sub-path relative to the download root, then reset the list.
    for img_url in self.info_list:
        to_url = self.newimg_src + img_url[len(self.src):]
        to_parUrl = os.path.dirname(to_url)
        if not os.path.exists(to_parUrl):
            os.makedirs(to_parUrl)
        shutil.copyfile(img_url, to_url)
        print('Newly downloaded image: ' + to_url)
    self.info_list.clear()
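How the end of a daily batch run might look, assuming the driver loop above has already populated Spyder.info_list; only the bk_newImg call comes from the original code.

```python
# Hypothetical end of a daily run: the crawl loop has filled Spyder.info_list
# with the files downloaded today.
Spyder.bk_newImg()  # copy each of them under newimg_src, then clear info_list
```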