京东商品爬虫
从https://list.jd.com/list.html?cat=670%2C671%2C673&page=1&s=57&click=0站点进行商品图片爬虫
分别爬取手机,pad,笔记本和台式机商品图片各1万张
#爬虫代码
import os
import re
import time
from multiprocessing.pool import Pool

import requests
from lxml import etree
# Collect the product-image URLs from one listing page.
def crawl(url, page):
    """Fetch one JD listing page and return its product-image URLs.

    Args:
        url: Full listing-page URL to fetch.
        page: Page number (currently unused; kept for the caller's interface).

    Returns:
        list[str]: Absolute (http-prefixed) image URLs scraped from the
        lazy-load attribute of each product thumbnail.

    Raises:
        requests.HTTPError: If the listing page returns an error status.
        requests.RequestException: On network failure or timeout.
    """
    headers = {
        # Browser UA so JD serves the normal listing markup.
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"
    }
    # Timeout prevents a single stalled request from hanging the whole crawl;
    # raise_for_status avoids silently parsing an error page.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    html = etree.HTML(response.text)
    # Images are lazy-loaded, so the real URL lives in @data-lazy-img, not @src.
    img_urls = html.xpath(
        "//div[@class='gl-i-wrap']/div[@class='p-img']/a/img/@data-lazy-img"
    )
    # The attribute values are protocol-relative (//img...); prepend a scheme.
    return ["http:" + img_url for img_url in img_urls]
# Download one image to the local data directory (pool worker entry point).
def download_img_multipro(img_down_param):
    """Download a single product image to ./data/com_img/.

    Args:
        img_down_param: A ``(index, url)`` tuple — the numeric index names the
            output file, the url is the image to fetch.

    Raises:
        requests.HTTPError: If the image request returns an error status.
        requests.RequestException: On network failure or timeout.
    """
    dir_path = "./data/com_img/"
    # Create the target directory up front so open() cannot fail on a
    # missing path during the first run.
    os.makedirs(dir_path, exist_ok=True)
    file_path = dir_path + str(img_down_param[0]) + ".jpg"
    # Timeout keeps a stuck worker from blocking the pool; raise_for_status
    # prevents writing an HTML error page out as a .jpg file.
    response = requests.get(img_down_param[1], timeout=10)
    response.raise_for_status()
    with open(file_path, "wb") as f:
        f.write(response.content)
    print(file_path + "下载完成")
#主程序
if __name__ == '__main__':
n = 0
#循环抓取三百多页
for i in range(1, 316):
#构造url
url = "https://list.jd.com/list.html?cat=670%2C671%2C673&page={}&s=57&click=0".format(i)
#获取该页的商品url列表
img_urls = crawl(url, i)
#定义图片命名数字
img_count = len(img_urls) + n
img_name = [j for j in range(n, img_count)]
n = img_count
#构造下载图片的实参,存储路径和图片url组成的元组组成的列表
img_down_param = zip(img_name, img_urls)
#创建进程池
pool = Pool(processes=5)
#启动多进程下载
pool.