Python + Selenium 多进程爬取淘宝搜索页数据
1. 功能描述
- 按照给定的关键词在淘宝搜索对应的产品,爬取搜索结果中每个产品的标题、价格、销量、产地等信息,并存入 MongoDB;为提高爬取效率,采用多进程方式运行。
2. 环境
- 系统:Windows 7
- MongoDB 3.4.6
- Python 3.6.1
- IDE:PyCharm
- 已安装 Chrome 浏览器(版本 63.0.3239.132,正式版本,32 位)
- selenium 3.7.0
- 已配置好 ChromeDriver v2.34
3. 代码
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import pymongo
import time
import datetime
import re
import multiprocessing
import lxml.html
import lxml.etree
# Search configuration keyed by a category keyword. Each value is a two-item
# list: an integer followed by a search phrase.
# NOTE(review): the integer looks like a page count / start page and the string
# like the phrase actually submitted to Taobao search — confirm against the
# code that reads this dict (not visible in this section).
keySearchWords = {
"动漫": [1, "动漫周边"],
"水果": [1, "水果沙拉"],
}
# MongoDB client pointed at the server on 127.0.0.1, port 27017.
client = pymongo.MongoClient("127.0.0.1:27017")
# Handle to the "taobao" database and its "productInfo" collection.
db = client["taobao"]
db_coll = db["productInfo"]
# NOTE(review): presumably the maximum number of retry attempts; the code that
# consumes this value is not visible here — confirm.
retryMax = 8
# Chrome options object; no arguments or preferences are set on it here.
chrome_options = webdriver.ChromeOptions()
def getProductMainInfo(htmlSource):
try:
resultTree = lxml.etree.HTML(htmlSource)
productLst = resultTree.xpath("//div[@class='m-itemlist']//div[contains(@class, 'J_MouserOnverReq')]")
print(f"productLst = {productLst}")
productInfoLst = []
for product in productLst:
productInfo = {
}
dataNid = product.xpath(".//div[contains(@class,'ctx-box')]//div[contains(@class, 'title')]/a/@data-nid")
if len(dataNid) > 0:
productInfo['dataNid'] = dataNid[0]
else:
productInfo['dataNid'] = 0
productInfo['_id'] = productInfo['dataNid']
taobaoCategory = product.xpath("@data-category")
if len(taobaoCategory) > 0:
productInfo['taobaoCategory'] = taobaoCategory[0]
else:
productInfo['taobaoCategory'] = 'unknow'
rank = product.xpath("@data-index")
if len(rank) > 0:
productInfo['rank'] = rank[0]
else:
productInfo['rank'] = 0
imgSrc = product.xpath(".//div[@class='pic']/a//img/@src")
if len(imgSrc) > 0:
productInfo['imgSrc'] = imgSrc[0]
else:
productInfo['imgSrc'] = ''
title = product.xpath(".//div[contains(@class,'ctx-box')]//div[contains(@class, 'title')]/a/text()")
productInfo['title'] = ''
if len(title) > 0:
for elem in title:
productInfo['title'] += elem.strip()
detailUrl = product.xpath(".//div[contains(@class,'title')]//a/@href"