这是一个单线程版本:获取图片。
import os
import requests
from lxml import etree
from lxml import html
from html.parser import HTMLParser
import re
# --- Single-threaded image scraper for sc.chinaz.com ---
count = 0  # running index used to name the saved files
wenjian = input("你的照片将要储存到......文件夹:")
img_path = f"./{wenjian}/"  # 指定保存地址 (target directory for downloads)
# Create the folder when it is missing, then always fall through to the
# download loop. (Originally the loop sat in the `else:` branch, so a
# freshly-created folder caused the script to download nothing.)
if not os.path.exists(img_path):
    print("您没有这个文件为您新建一个文件:")
    os.mkdir(img_path)
for i in range(1, 5):
    # Page 1 of the listing has no numeric suffix; pages 2+ do.
    if i == 1:
        url = "https://sc.chinaz.com/tupian/nvshengtupian.html"
    else:
        url = f"https://sc.chinaz.com/tupian/nvshengtupian_{i}.html"
    headers = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
    }
    response = requests.get(url, headers=headers)
    response.encoding = "utf-8"
    page_text = response.text
    # Thumbnails are lazy-loaded; the real image URL sits in data-original.
    img_html = re.findall('data-original="(.*?)"', page_text)
    for img in img_html:
        img = 'https:' + img  # listing page omits the scheme
        count += 1
        myimg = requests.get(img)
        file_name = f'{img_path}图片{str(count)}.jpg'
        # 图片和音乐WB的二进制写入方式 — binary write; `with` guarantees
        # the handle is closed (the original leaked one handle per image).
        with open(file_name, "wb") as f:
            f.write(myimg.content)
        print("正在保存" + str(count) + " 张图片")
不用说,这样速度超级慢,
所以采用多线程。
import queue
import re
import time
import random
import threading
import requests
from bs4 import BeautifulSoup
import queue
def _parse_image_list(page_text):
    """Extract (image_url, image_name) pairs from one listing page's HTML.

    Mirrors the original pairing: data-original URLs zipped with alt texts
    through a dict, which also silently drops duplicate URLs.
    """
    # Lazy-loaded thumbnails keep the real URL in the data-original attribute;
    # the listing page omits the scheme, so prepend it.
    urls = ['https:' + u for u in re.findall('data-original="(.*?)"', page_text)]
    names = re.findall('alt="(.*?)"', page_text)
    return list(dict(zip(urls, names)).items())


def get_main(url):
    """Fetch one listing page and return a list of (image_url, name) tuples.

    Network fetch is separated from HTML parsing (_parse_image_list) so the
    parsing step can be exercised without hitting the site.
    """
    headers = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
    }
    response = requests.get(url, headers=headers)
    response.encoding = "utf-8"
    return _parse_image_list(response.text)
def do_craw(url_queue: queue.Queue, fout):
    """Worker: drain URLs from url_queue, scrape each page via get_main,
    and append each (url, name) tuple to fout, one per line.

    Returns once the queue stays empty, so worker threads can actually
    finish (the original blocked forever on get(), and the non-daemon
    threads kept the program alive indefinitely).
    """
    while True:
        try:
            # Short timeout instead of an indefinite block: when every page
            # has been claimed, the worker exits cleanly.
            url = url_queue.get(timeout=5)
        except queue.Empty:
            return
        # 'item' instead of the original 'list', which shadowed the builtin.
        for item in get_main(url):
            fout.write(str(item) + "\n")
        # Polite random delay between pages to avoid hammering the site.
        time.sleep(random.randint(1, 2))
if __name__ == "__main__":
    # Listing pages 2..15 — 14 URLs total (page 1 uses a different pattern
    # and is not fetched here).
    urls = {
        f"https://sc.chinaz.com/tupian/nvshengtupian_{i}.html"
        for i in range(2, 16)
    }
    url_queue = queue.Queue()
    for url in urls:
        url_queue.put(url)
    # Shared output file; deliberately left open because the worker threads
    # outlive this block — closing it here would break their writes.
    fout = open("02.zhanzhang.txt", "w")
    # One worker per page. The original repeated this loop verbatim and
    # accidentally started 28 threads for 14 URLs.
    for _ in range(14):
        t = threading.Thread(target=do_craw, args=(url_queue, fout))
        t.start()
这个的意思是我开了14个线程,因为我只有14个要抓取的页面,所以14个线程就够了。
要是想要更快,在获取网址上面加入线程。
这个是例子,要看上一篇我写的文章,就会对线程有初步了解。我写了几个例子加深理解。
最重要的是使用queue模块,使各个线程共享数据。
如果想要下载图片,再加一个线程,用来获取图片的网址,然后下载。
一定得到的是jpg网址哦!!!