主要代码如下:
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 17 20:17:34 2018
利用selenium+正则表达式 模拟chrome浏览器爬淘宝美食信息
重点学习selenium库的用法和思想
利用PhantomJS 构造无界面浏览器, 使浏览器后台运行
Mongodb数据库存储数据, 包括用到pymongo库
@author: Administrator
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import re
from pyquery import PyQuery as pq
from config import * # config module: search keyword, database names, etc. (these could also be defined directly in this file)
# Echo the configured database name at start-up.
print(MONGO_DB)
import pymongo
from pymongo import MongoClient
# Module-level MongoDB handles shared by save_to_mongo().
client = MongoClient(MONGO_URL) # create the client / connection for MONGO_URL
db = client[MONGO_DB] # database selected by name from the config
#from selenium.webdriver.common.keys import Keys
# Module-level browser shared by every scraping function; it is started as
# soon as this module is executed.
browser = webdriver.Chrome()
# Headless alternative (PhantomJS), currently disabled:
#browser = webdriver.PhantomJS(service_args=SERVICE_ARGS) # headless browser window
#browser.set_window_size(1400,900)
# Shared explicit wait bound to the browser, created with a timeout of 10.
wait = WebDriverWait(browser, 10)
def search(Keyword):
    """Search Taobao for ``Keyword`` and scrape the first result page.

    Loads the Taobao home page, types the keyword into the search box,
    submits the form, waits for the pager's "total" element, and then calls
    ``get_products()`` for the page that is displayed.

    Args:
        Keyword: Text to type into the search box.

    Returns:
        The text of the pager's ``div.total`` element.

    If any wait times out, the whole search is retried with the same keyword.
    """
    print('正在搜索')
    try:
        browser.get("http://www.taobao.com")
        # Search input box.
        inputinfo = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
        # Search submit button.
        submit = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
        inputinfo.send_keys(Keyword)
        submit.click()
        total = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
        get_products()
        return total.text
    except TimeoutException:
        # The keyword must be forwarded: search() has a required parameter,
        # so a bare search() call would raise TypeError instead of retrying.
        return search(Keyword)
def next_page(page_number):
    """Jump to result page ``page_number`` via the pager form and scrape it.

    Types the page number into the pager's input, clicks the submit button,
    waits until the active pager item shows that number, and then calls
    ``get_products()``.

    Args:
        page_number: Page to jump to.

    If any wait times out, the same page is retried by calling this function
    again.
    """
    print('正在翻页')
    page_input_locator = (
        By.CSS_SELECTOR,
        '#mainsrp-pager > div > div > div > div.form > input',
    )
    jump_button_locator = (
        By.CSS_SELECTOR,
        '#mainsrp-pager > div > div > div > div.form >span.btn.J_Submit',
    )
    active_page_locator = (
        By.CSS_SELECTOR,
        '#mainsrp-pager > div > div > div > ul >li.item.active > span',
    )
    try:
        page_box = wait.until(
            EC.presence_of_element_located(page_input_locator))
        jump_button = wait.until(
            EC.element_to_be_clickable(jump_button_locator))
        page_box.clear()
        page_box.send_keys(page_number)
        jump_button.click()
        # The jump is complete once the highlighted pager item shows the
        # requested number.
        expected_label = str(page_number)
        wait.until(EC.text_to_be_present_in_element(
            active_page_locator, expected_label))
        get_products()
    except TimeoutException:
        return next_page(page_number)
def get_products():
    """Parse every product on the current result page, store it and print it.

    Waits until the item elements are present, parses the browser's page
    source with pyquery, and for each item builds a dict with the keys
    ``image``, ``price``, ``deal``, ``title``, ``shop`` and ``location``.
    Each dict is passed to ``save_to_mongo()`` and then printed.
    """
    item_selector = '#mainsrp-itemlist .items .item'
    wait.until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, item_selector)))
    # Parse the rendered page source with pyquery.
    parsed_page = pq(browser.page_source)
    for entry in parsed_page(item_selector).items():
        image_url = entry.find('.pic .img').attr('src')
        # First two characters are dropped (presumably a currency prefix --
        # confirm against the live markup).
        price_text = entry.find('.price').text()[2:]
        # Last three characters are dropped (presumably a fixed text suffix --
        # confirm against the live markup).
        deal_text = entry.find('.deal-cnt').text()[:-3]
        title_text = entry.find('.title').text()
        shop_name = entry.find('.shop').text()
        shop_location = entry.find('.location').text()
        product = {
            'image': image_url,
            'price': price_text,
            'deal': deal_text,
            'title': title_text,
            'shop': shop_name,
            'location': shop_location,
        }
        save_to_mongo(product)
        print(product)
def save_to_mongo(result):
    """Insert one product document into the ``MONGO_TABLE`` collection.

    Args:
        result: Product dict to store.

    Any exception raised by the insert is caught and reported on stdout
    together with the document, so the caller keeps running.
    """
    try:
        # insert_one() is the supported single-document insert;
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4.
        db[MONGO_TABLE].insert_one(result)
        print('存储到MONGODB成功', result)
    except Exception as exc:
        # Print the cause as well, so a failure can be diagnosed without
        # having to remove this try/except.
        print('存储到MONGODB失败', result, exc)
def main():
    """Run the crawl: search, read the page count, then visit further pages.

    Searches for the configured ``Keyword``, extracts the first integer from
    the pager text returned by ``search()``, prints it, and calls
    ``next_page()`` for each page number in the loop range. The browser
    session is always shut down at the end.

    Raises:
        ValueError: If the pager text contains no digits.
    """
    try:
        total = search(Keyword)
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        match = re.search(r'(\d+)', total)
        if match is None:
            raise ValueError(
                'no page count found in pager text: %r' % (total,))
        total = int(match.group(1))
        print(total)
        # NOTE(review): this range starts at page 3 and stops before
        # total - 93, so page 2 and most later pages are never visited.
        # It looks like a temporary limiter; confirm the intended range
        # (e.g. range(2, total + 1)) before widening it.
        for i in range(3, total - 93):
            next_page(i)
    finally:
        # quit() ends the whole WebDriver session (all windows and the
        # driver process); close() would only close the current window.
        browser.quit()


if __name__ == '__main__':
    main()
config.py配置文件 代码如下:
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 19 10:27:20 2018
config.py文件
@author: Administrator
"""
# --- MongoDB settings ---
# Alternative forms of the connection target, kept for reference:
#   MONGO_URL = 'localhost'
#   MONGO_URL = 'mongodb://mongodb0.example.net:27019'
# List-of-hosts form; see the MongoDB API documentation.
MONGO_URL = ["localhost:27017"]
MONGO_DB = "taobao"
MONGO_TABLE = "product"

# Search keyword used by the crawler.
Keyword = "美食"

# --- Headless browser settings ---
SERVICE_ARGS = [
    "--load-images=false",
    "--disk-cache=true",
]