商品信息数据爬取下载
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from lxml import etree
import time
from pymongo import MongoClient
# Shared Selenium Chrome driver used by every crawl function below.
browser = webdriver.Chrome(r"D:/chromedriver_win32/chromedriver.exe")
# Explicit-wait helper: polls for expected conditions for up to 10 seconds.
wait = WebDriverWait(browser, 10)
# Default commodity keyword searched on Taobao.
KEYWORD = "ipad"
# Logging in is required before Taobao serves the product search page.
def login_taobao(max_retries=3):
    """Open the Taobao search page and submit the login form.

    The original retried on ``TimeoutException`` by recursing without any
    bound, which can overflow the stack if the site keeps timing out;
    ``max_retries`` (default 3, backward-compatible) caps the retries.
    """
    print("be logining taobao website...")
    try:
        url = "https://s.taobao.com/search"
        browser.get(url)
        user_name = "淘宝账号"  # placeholder: fill in the real account
        password = "密码"       # placeholder: fill in the real password
        fm_login_name = browser.find_element_by_id("fm-login-id")
        fm_login_password = browser.find_element_by_id("fm-login-password")
        login_button = browser.find_element_by_class_name("fm-button")
        fm_login_name.send_keys(user_name)
        fm_login_password.send_keys(password)
        login_button.click()
        print("login successfull!")
    except TimeoutException:
        # Page load timed out -- retry, but only a bounded number of times.
        if max_retries > 0:
            login_taobao(max_retries - 1)
# Search for a commodity from the Taobao search page.
def search_commodity(name=KEYWORD):
    """Type *name* into the search box and submit the query."""
    try:
        box = wait.until(EC.presence_of_element_located((By.ID, "q")))
        submit = wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, "submit")))
        box.send_keys(name)
        submit.click()
        print("searching commodity successfull!")
    except TimeoutException:
        print("failed to searching...")
# Crawl one page of search results.
def index_page(page):
    """Navigate to result page *page*, wait for the products, and scrape them.

    Pages after the first are reached through the pager's page-number input.
    On a load timeout the whole page request is retried.
    """
    print("be crawling {} page".format(page))
    try:
        # Pages beyond the first require jumping via the pager controls.
        if page > 1:
            input_box = wait.until(
                EC.presence_of_element_located((By.CLASS_NAME, "J_Input")))
            submit_button = wait.until(
                EC.presence_of_element_located((By.CLASS_NAME, "J_Submit")))
            input_box.clear()
            input_box.send_keys(page)
            submit_button.click()
        # Pause briefly so we do not crawl fast enough to get banned.
        time.sleep(2)
        # Wait until at least one product card is rendered.  The original
        # waited on "J_MouserOnverReq " (trailing space), which is not a
        # valid single class name for By.CLASS_NAME.
        wait.until(EC.presence_of_element_located(
            (By.CLASS_NAME, "J_MouserOnverReq")))
        # Parse the product details out of the loaded page.
        get_products_info(page)
    except TimeoutException:
        # Retry on load timeout.  The original caught TypeError, which
        # wait.until never raises on timeout, so timeouts were unhandled.
        index_page(page)
# Parse the product details out of the currently loaded results page.
def get_products_info(page):
    """Extract product fields from ``browser.page_source`` and persist them.

    Each product card yields image URL, price, deal count, title, shop name
    and location; any field missing from the page becomes None.  The
    original also built an ``image_save_list`` that was never used -- dead
    code, removed here.
    """
    html = etree.HTML(browser.page_source)
    h_items = html.xpath("//div[contains(@class, 'J_MouserOnverReq')]")

    def _first(values):
        # xpath() returns a list; take the first match or None when absent.
        return values[0] if values else None

    items = []
    for h_item in h_items:
        item = {}
        image = _first(h_item.xpath(".//img[@class='J_ItemPic img']/@data-src"))
        # data-src is protocol-relative ("//img...."), so prefix the scheme.
        item["image"] = "https:" + image if image is not None else None
        item["price"] = _first(h_item.xpath(
            ".//div[@class='price g_price g_price-highlight']/strong/text()"))
        item["deal"] = _first(h_item.xpath(".//div[@class='deal-cnt']/text()"))
        title_parts = h_item.xpath(".//div[@class='row row-2 title']//text()")
        item["title"] = "".join(title_parts).strip() if title_parts else None
        shop_parts = h_item.xpath(".//div[@class='shop']/a//text()")
        item["shop"] = "".join(shop_parts).strip() if shop_parts else None
        item["location"] = _first(
            h_item.xpath(".//div[@class='location']/text()"))
        items.append(item)
    print("crawling {} page successfull".format(page))
    save_data_into_mongo(items, page)
save_data_into_mongo(items, page)
# Save one page of scraped products into the local MongoDB instance.
def save_data_into_mongo(items, page):
    """Insert *items* (list of product dicts) into taobao_db.commodity_col.

    Skips the insert when *items* is empty: pymongo's ``insert_many([])``
    raises ``InvalidOperation``, which would have aborted the crawl on any
    page that yielded no products.
    """
    if not items:
        print("{} page had no items, nothing saved".format(page))
        return
    client = MongoClient(host="127.0.0.1", port=27017)
    try:
        taobao_db = client["taobao_db"]
        commodity_col = taobao_db["commodity_col"]
        commodity_col.insert_many(items)
    finally:
        # Release the connection even if the insert raises.
        client.close()
    print("{} page saved successfull!".format(page))
# Entry point: log in, search, then crawl the first 10 result pages.
def main():
    login_taobao()
    search_commodity()
    # Crawl pages 1 through 10.
    for page in range(1, 11):
        index_page(page)
    # quit() shuts down the whole WebDriver session; the original's
    # close() only closes the window and leaves the driver process alive.
    browser.quit()
if __name__ == '__main__':
    main()
爬取结果如下:
图片单进程下载到本地
import re
import requests
from pymongo import MongoClient
# Load the (image url, commodity title) pairs scraped into MongoDB earlier.
def read_data():
    """Return a list of (image_url, title) tuples from taobao_db.commodity_col."""
    client = MongoClient()
    collection = client["taobao_db"]["commodity_col"]
    pairs = []
    for document in collection.find():
        pairs.append((document["image"], document["title"]))
    return pairs
# Download a single commodity image to the local data directory.
def main(url, title):
    """Fetch *url* and save it as "<title>.jpg" under ./data/ipad商品图片/.

    Fixes in this version: the script never imported ``re`` (NameError on
    the first call), the target directory was assumed to exist, and the
    request had no timeout (a stalled server hung the download forever).
    """
    import os
    import re
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
    }
    file_name = title + ".jpg"
    # Strip "/" so the title is usable as a file name.
    file_name = re.sub(r"/", "", file_name)
    save_dir = "./data/ipad商品图片/"
    # Create the directory if missing (the original crashed otherwise).
    os.makedirs(save_dir, exist_ok=True)
    response = requests.get(url, headers=headers, timeout=10)
    with open(save_dir + file_name, "wb") as f:
        f.write(response.content)
    print(file_name + "下载成功!")
# Download every image, one after another.
if __name__ == '__main__':
    for image_url, commodity_title in read_data():
        main(image_url, commodity_title)
图片多进程下载到本地
import requests
from multiprocessing.pool import Pool
from pymongo import MongoClient
def read_data():
    """Read every (image_url, title) pair out of the local MongoDB store."""
    collection = MongoClient()["taobao_db"]["commodity_col"]
    return [(record["image"], record["title"]) for record in collection.find()]
def main(img_tit):
    """Download one image given an (image_url, title) tuple.

    Runs as a multiprocessing worker, so its dependencies are imported
    locally.  Fixes: ``re`` was never imported in this script (NameError
    in every worker), the target directory was assumed to exist, and the
    request had no timeout.
    """
    import os
    import re
    url, title = img_tit
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
    }
    file_name = title + ".jpg"
    # Strip "/" so the title is usable as a file name.
    file_name = re.sub(r"/", "", file_name)
    save_dir = "./data/ipad商品图片/"
    # Create the directory if missing (the original crashed otherwise).
    os.makedirs(save_dir, exist_ok=True)
    response = requests.get(url, headers=headers, timeout=10)
    with open(save_dir + file_name, "wb") as f:
        f.write(response.content)
    print(file_name + "下载成功!")
if __name__ == '__main__':
    pairs = read_data()
    # Five worker processes download images in parallel.
    worker_pool = Pool(processes=5)
    # Fan the (url, title) pairs out across the pool.
    worker_pool.map(main, pairs)
    # close() stops new work from being submitted (it does not kill the
    # pool); join() then blocks until every worker has finished.
    worker_pool.close()
    worker_pool.join()