'''
1. 分析网页结构,找到咱们所要爬取的网页url的共同点
2. 分析我们要爬取的页面, 发现页面中有图片url链接
3. 对页面进行请求
4. 拿到图片url
5. 下载图片
'''
import requests
from bs4 import BeautifulSoup
import os
# Module-level accumulator shared between Producer (writer) and Downloader
# (reader). Each entry is a single-item dict {item_title: item_page_url}.
urls = []
class Producer():
    """Collect item-page URLs for one page of a clothing search.

    Fetches a search-result page from gz.17zwd.com and appends one
    ``{title: item_page_url}`` dict per result to the module-level
    ``urls`` list, which ``Downloader.down`` later consumes.
    """

    def __init__(self, query, num):
        # Plain string instead of a placeholder-less f-string; the trailing
        # "?" is dropped — requests builds the query string from ``params``.
        self.url = "https://gz.17zwd.com/sks.htm"
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.20 Safari/537.36"
        }
        self.query = query  # search keyword (clothing style)
        self.num = num      # result page number (string from input() works too)
        self.param = {
            "so": query,
            "page": num
        }

    def collect(self):
        """Request the search page and harvest item links into ``urls``.

        Example item page: https://gz.17zwd.com/item/122040382
        """
        # Context manager releases the connection even if parsing raises;
        # the original only closed the response on the success path.
        with requests.get(url=self.url, headers=self.headers, params=self.param) as resp:
            soup = BeautifulSoup(resp.text, "lxml")
        divs = soup.find_all("div", {"class": "huohao-img-container"})
        # ``global`` is unnecessary: we only mutate the list, never rebind it.
        for div in divs:
            # href looks like "/item/<id>?..." -> segment 2 is the numeric id.
            item_id = div.a["href"].split("?")[0].split("/")[2]
            title = div.a["title"]
            # Append directly; the original bound the dict to a name that
            # shadowed the builtin ``dict``.
            urls.append({title: "https://gz.17zwd.com/item/" + item_id})
class Downloader():
    """Download product images for every item collected in ``urls``.

    For each ``{title: item_url}`` entry, fetch the item page, locate the
    gallery ``<img>`` tags, and save each image as ``<title>/<n>.jpg``.
    """

    # The item pages nest the gallery images at varying depths; try each
    # selector in order until one matches (replaces the copy-pasted
    # five-way if-chain of the original).
    _IMG_SELECTORS = (
        "div.index-root-18be6rFhXn87nebHWkNwG1 > div > img",
        "div.index-root-18be6rFhXn87nebHWkNwG1 > div > div > img",
        "div.index-root-18be6rFhXn87nebHWkNwG1 > div > div > div > img",
        "div.index-root-18be6rFhXn87nebHWkNwG1 > div > p > img",
        "div.index-root-18be6rFhXn87nebHWkNwG1 > p > img",
    )

    def __init__(self):
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.20 Safari/537.36"
        }

    def _find_images(self, soup):
        """Return the first non-empty selector match, or [] if none hit."""
        for selector in self._IMG_SELECTORS:
            imgs = soup.select(selector)
            if imgs:
                return imgs
        return []

    def down(self):
        """Fetch each item page and save its gallery images to disk."""
        for url in urls:
            title = list(url.keys())[0]
            # Distinct names + context managers fix the original's leak:
            # the image loop rebound ``resp``, so the item-page response
            # was never closed and the last image response closed twice.
            with requests.get(url=url[title], headers=self.headers) as page:
                soup = BeautifulSoup(page.text, "lxml")
            # NOTE(review): ``title`` comes straight from the page and may
            # contain characters invalid in directory names — TODO confirm.
            if not os.path.exists(title):
                os.mkdir(title)
            for i, img in enumerate(self._find_images(soup), start=1):
                with requests.get(url=img["src"], headers=self.headers) as resp:
                    with open(title + "/" + str(i) + ".jpg", "wb") as f:
                        f.write(resp.content)
if __name__ == '__main__':
    # Interactive entry point: ask for a search keyword and a page number,
    # collect the item URLs, then download every product image.
    query = input("请输入衣服款式:")
    num = input("请输入需要抓取的页码:")
    Producer(query, num).collect()
    Downloader().down()
# 女装商品图片爬取 (women's clothing product image scraping — article title
# pasted in from the blog this script came from; commented out so the file
# is valid Python)
# 最新推荐文章于 2024-07-30 20:28:19 发布