![13b3d0115c2f136dcf9c65daf91bcafe.png](https://i-blog.csdnimg.cn/blog_migrate/3366fc43cc8f95f1545adfdcce8cfff1.jpeg)
Python爬虫实战,python多线程抓取头像图片源码附exe程序及资源包
python多线程抓取头像图片源码附exe程序及资源包!
1.使用到的库requests、etree、re、os、ThreadPool
2.网页编码为utf-8需要转码:html.encoding="utf-8"
3.使用xpath获取图片链接
4.使用了多线程
5.需要输入页面n,具体可以看动态图片
6.头像首页为栏目页,没有页码,这里用了if判断
7.py打包exe命令:pyinstaller -F 目录文件.py
![a7c96587ed24e311956314036b71402f.png](https://i-blog.csdnimg.cn/blog_migrate/4c942f19fc4e0b0bd2da01944404af20.jpeg)
![0c91b3fd3d8b51344754d4ca372a7258.gif](https://i-blog.csdnimg.cn/blog_migrate/8eb23270f5bf961c25dd7db4fc7b5fc4.gif)
![06e364c28877db60393045bcade93058.gif](https://i-blog.csdnimg.cn/blog_migrate/200d2455516cd55e7091b297b7589fd7.gif)
#by 微信:huguo00289
# -*- coding: utf-8 -*-
import requests
from lxml import etree
import re
import os
from multiprocessing.dummy import Pool as ThreadPool
def hqlj(n):
    """Collect avatar detail-page links from the first *n* list pages.

    :param n: number of list pages to crawl (>= 1)
    :return: list of absolute detail-page URLs found on those pages
    """
    urls = []
    for x in range(1, n + 1):
        # Page 1 is the section index and has no page-number suffix;
        # pages 2..n are index_2.html, index_3.html, ...
        if x == 1:
            url = 'https://www.woyaogexing.com/touxiang/index.html'
        else:
            url = f'https://www.woyaogexing.com/touxiang/index_{x}.html'
        print(url)
        # timeout so a dead server cannot hang the crawl forever
        html = requests.get(url, timeout=10)
        html.encoding = "utf-8"  # pages are utf-8; requests may mis-guess otherwise
        con = etree.HTML(html.text)
        # Each avatar card is a div with class "txList " (note trailing space
        # in the site's markup); its <a href> is a site-relative detail link.
        href = con.xpath('//div[@class="txList "]/a/@href')
        print(href)
        for lj in href:
            lj = f'https://www.woyaogexing.com{lj}'
            print(lj)
            urls.append(lj)
    print(urls)
    return urls
def hqtx(url):
    """Download every avatar image on one detail page into ./touxiang/<title>/.

    :param url: absolute URL of an avatar detail page
    """
    html = requests.get(url, timeout=10)
    html.encoding = "utf-8"
    con = etree.HTML(html.text)
    h1 = con.xpath('//h1/text()')[0]
    # Strip characters that are illegal in Windows directory names.
    h1 = re.sub(r'[|/<>:*?"]', "_", h1)
    print(h1)
    os.makedirs(f'./touxiang/{h1}/', exist_ok=True)
    imgs = con.xpath('//img[@class="lazy"]/@src')
    print(imgs)
    for i, img in enumerate(imgs, start=1):
        img_url = f'https:{img}'  # src attributes are protocol-relative (//...)
        # splitext keeps the true extension (.jpeg, .jpg, .png, .gif) instead
        # of the original fragile [-5:]/[-4:] slicing.
        ext = os.path.splitext(img_url)[1]
        img_name = f'{i}{ext}'
        print(img_name)
        print(img_url)
        r = requests.get(img_url, timeout=10)
        # 'wb' (not the original 'ab+'): re-running the crawler must overwrite,
        # not append duplicate bytes onto an existing image file.
        with open(f'./touxiang/{h1}/{img_name}', 'wb') as f:
            f.write(r.content)
        print(f"保存{img_name}图片成功!")
#hqlj("https://www.woyaogexing.com/touxiang/")
if __name__ == '__main__':
    # Ask how many list pages to crawl, then fan the detail pages out
    # over a thread pool (the work is network-bound, so threads help).
    n = int(input("请输入要采集的页码数:"))
    urls = hqlj(n)
    try:
        # With no argument the pool defaults to the CPU core count.
        pool = ThreadPool()
        results = pool.map(hqtx, urls)
        pool.close()
        pool.join()
        print("采集所有头像完成!")
    except Exception as e:
        # The original bare `except:` swallowed every error (even Ctrl-C)
        # behind a generic message; report what actually went wrong.
        print(f"Error: unable to start thread ({e})")
![cd6dbbdbf160812cfb25ea2d567342d0.png](https://i-blog.csdnimg.cn/blog_migrate/0b35946a45be76520088d47ef593918d.jpeg)
最后附上exe打包程序,需要的可以试试!
链接: https://pan.baidu.com/s/12--cjhgy_emKhx5-pEg5sA 提取码: fuas
爬取了500页数据,分享给大家吧!总共1.71g!
链接:https://pan.baidu.com/s/1kS-wDMc9yqaRl1m2qxKfCA 提取码:trrz
看了下有部分数据编码好像有问题,大家凑合着用吧,不想改了!