python—表情包爬取
闲着无聊时写了一个表情包爬取的简单脚本
运行效果如下
- 输入要爬取的页数以及要保存的目录地址
- 坐等表情包
- 运行后效果如下
代码如下
import requests
from bs4 import BeautifulSoup
import os
# Request headers passed as `headers=` to the requests.get calls in this
# script; they present a desktop-browser User-Agent string.
# (A `global` statement is meaningless at module level, so none is used:
# assigning here already creates the module-level name.)
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 '
        'Core/1.70.3754.400 QQBrowser/10.5.4020.400'
    ),
}
def get_page(url):
    """Fetch one listing page and collect the article links found on it.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list of dicts, one per usable anchor matched by the listing
        selector, each with the keys ``'title'`` (text of the anchor's
        ``.random_title`` element) and ``'href'`` (the anchor's link target).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, headers=headers, timeout=(3, 3))
    # Fail loudly on 4xx/5xx instead of parsing an error page as a listing.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    item_list = []
    for item in soup.select('#home > div > div.col-sm-9.center-wrap > a'):
        title = item.select_one('.random_title')
        href = item.get('href')
        # Skip anchors lacking a title element or a link target so that one
        # odd entry does not abort the whole page.
        if title is None or not href:
            continue
        item_list.append({'title': title.get_text(), 'href': href})
    return item_list
def get_href(item_list, dizhi):
    """Download the images of every article in *item_list*.

    For each item, a directory ``<dizhi>/<title>`` is created, the article
    page is fetched, and every image matched by the picture selector is
    handed to ``down_img``. The work is best effort: an article page that
    cannot be fetched, or an image that cannot be saved, is reported on
    stdout and skipped.

    Args:
        item_list: Iterable of dicts with the keys ``'title'`` and ``'href'``.
        dizhi: Base directory under which the per-article directories are
            created.

    Raises:
        OSError: If a target directory cannot be created.
    """
    for item in item_list:
        address = '{}/{}'.format(dizhi, item['title'])
        # makedirs also creates a missing base directory and tolerates a
        # directory that already exists.
        os.makedirs(address, exist_ok=True)
        print(address)

        try:
            response = requests.get(item['href'], headers=headers, timeout=(3, 3))
            response.raise_for_status()
        except requests.RequestException as exc:
            # One unreachable article must not stop the remaining ones.
            print('页面获取失败: {} ({})'.format(item['href'], exc))
            continue

        soup = BeautifulSoup(response.text, 'lxml')
        imgs = soup.select('body > div.container_ > div.container > div > div.col-sm-9.center-wrap > li > div.pic-content > div > table > tbody > tr:nth-child(1) > td > a > img')

        img_list = []
        for num, img in enumerate(imgs, start=1):
            src = img.get('src')
            if not src:
                # Nothing to download without a source address.
                continue
            # Fall back to the 1-based position when the image has no
            # (or an empty) alt text to use as a file name.
            img_list.append({'name': img.get('alt') or str(num), 'url': src})

        for img in img_list:
            try:
                down_img(img['url'], address, img['name'])
            except Exception as exc:
                # Keep going with the next image, but say what went wrong
                # instead of hiding it.
                print('下载失败: {} ({})'.format(img['url'], exc))
def down_img(url, address, name):
    """Save the image at *url* as ``<address>/<name>.jpg``.

    The target path is checked first; when the file already exists, no
    network request is made and the file is left untouched.

    Args:
        url: Address of the image to download.
        address: Existing directory in which to store the file.
        name: File name without extension; ``.jpg`` is always appended.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        OSError: If the file cannot be written.
    """
    dress = address + '/' + name + '.jpg'
    print('正在下载...')
    print(dress)
    # Check before fetching so an already-saved image costs no request.
    if not os.path.exists(dress):
        response = requests.get(url, headers=headers, timeout=(3, 3))
        # Do not store an HTTP error page under an image file name.
        response.raise_for_status()
        with open(dress, 'wb') as sub:
            sub.write(response.content)
    print('OK')
def main():
    """Ask for a page count and a target directory, then download.

    Listing pages 1 through the entered count (inclusive) are processed in
    order. A page that fails is reported on stdout and the next page is
    tried.

    Raises:
        ValueError: If the entered page count is not an integer.
    """
    page_count = int(input('请输入要下载的页数 :'))
    dizhi = input('请输入要下载的地址(C:/Users/KYA08/Desktop/表情包)可以桌面创建表情包文件夹,然后复制前面路径做简单修改:')
    # range() excludes its stop value, so +1 is needed for the last
    # requested page to be fetched as well.
    for page in range(1, page_count + 1):
        url = 'https://www.doutula.com/article/list/?page={}'.format(page)
        try:
            items = get_page(url)
            get_href(items, dizhi)
        except Exception as e:
            # Best effort: report the failure and move on to the next page.
            print('第{}页处理失败: {}'.format(page, e))
            continue


if __name__ == '__main__':
    main()
可自行打包
方式如下
1. 进入此py文件所在的文件夹
2. 在文件路径框内输入 cmd,回车即可进入当前目录的命令行窗口
3. 进入命令行窗口后输入以下命令:
pyinstaller -F demo.py
注: 将 demo.py 替换为实际的py文件名
4. 即可在当前文件夹下生成dist文件夹,里面即为打包后的可执行文件
说明
- 爬取网站为随机找的表情包网站,不含恶意
- 整体代码不够完善,一些空目录未处理,速度未做提升,主要为基本操作代码,大佬勿喷,本人小白