今天做了暴走gif的爬取小项目@TOC
随着数据的增多,下一步考虑增加数据库。
但今天的项目代码还待进一步优化。不管怎样,每天一斤奶,强壮中国人。
python爬虫
import requests as re
from bs4 import BeautifulSoup as Btf
from PIL import Image
import pandas as pd
import re as r
import matplotlib.pyplot as plt
import time
def saveImage(urls, i, save_dir='C:/Users/41318/Pictures/相册/'):
    """Download every image URL in *urls* and save it under *save_dir*.

    Files are named '<i>-<j>.jpg' where j is the position of the URL in
    *urls*.

    Parameters
    ----------
    urls : iterable of str
        Direct image URLs to download.
    i : int or str
        Page index used as the filename prefix.
    save_dir : str
        Target directory; must already exist. Defaults to the original
        hard-coded album path for backward compatibility.
    """
    for j, url in enumerate(urls):
        print("正在保存第{}个".format(j))
        try:
            # NOTE: `re` is the requests module (aliased at the top of the file).
            pic = re.get(url, timeout=10)
            # Without this check a 404/500 error page would be written
            # to disk as a broken .jpg file.
            pic.raise_for_status()
        except Exception as exc:
            print("download failed: {} ({})".format(url, exc))
            continue
        time.sleep(1)  # be polite to the server between downloads
        file_full_name = save_dir + str(i) + '-' + str(j) + '.jpg'
        with open(file_full_name, 'wb') as f:
            f.write(pic.content)
def imgPrin():
    """Demo of basic Pillow operations: inspect, copy, thumbnail, rotate, paste.

    Side effects: prints the source image's size/format and writes a copy
    to img2.jpg. The pasted result is never saved (display is commented out),
    matching the original demo.
    """
    src_path = r'C:/Users/41318/Pictures/相册/DSC_4024.jpg'
    copy_path = r'C:/Users/41318/Pictures/相册/img2.jpg'
    # Context managers close the underlying files promptly; the original
    # opened the source twice and never closed any handle.
    with Image.open(src_path) as img:
        print(img.size, img.format)
        img.save(copy_path)  # re-encodes the copy as JPEG, as before
        with Image.open(copy_path) as img2:
            size = (288, 180)
            img2.thumbnail(size)  # in-place shrink, keeps aspect ratio
            out = img2.rotate(45)
            img.paste(out, (50, 50))
            # img.show()
def imgDowload(url):
    """Fetch one baozougif list page and return the GIF URLs found on it.

    Parameters
    ----------
    url : str
        The list-page URL to scrape.

    Returns
    -------
    list of str
        The `data-original` attribute of every <img class="waitpic"> tag
        (the lazy-loaded GIF location).
    """
    # NOTE: `re` is the requests module, `Btf` is BeautifulSoup
    # (both aliased at the top of the file).
    data = re.get(url, timeout=10)
    soup = Btf(data.text, 'lxml')
    tags = soup.findAll('img', class_="waitpic")
    # Read the attribute directly from each tag. The original str()-ified
    # the tag list and ran two chained regexes over the repr, which is
    # brittle (the second pattern "src=(.*?)'" has no reliable match in a
    # list of plain URLs).
    return [tag['data-original'] for tag in tags if tag.has_attr('data-original')]
# Main driver: scrape list pages 2..21 and download every GIF found.
if __name__ == '__main__':
    all_urls = []
    for i in range(2, 22):
        url = 'https://baozougif.com/page_' + str(i) + '.html'
        imgPage = imgDowload(url)
        saveImage(imgPage, i)
        all_urls.extend(imgPage)
        time.sleep(5)  # throttle between pages
    # Summary of everything scraped. (The original referenced `baozouImg`,
    # a local variable of imgDowload, at module level — a NameError; build
    # the frame from the URLs collected above instead.)
    data = pd.DataFrame(all_urls)
    print(data)
##优化
连接数据库后,存储效率大大提升。
代码如下: