# Python + Selenium: scraping product data from JD.com

from selenium import  webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import  re
from selenium.webdriver.common.keys import Keys
import  time
from lxml import etree
from config import *
import pymongo

# Shared Chrome WebDriver session used by the functions below.
# It is created (and the window maximized) as soon as this module is loaded.
browse=webdriver.Chrome()
browse.maximize_window()

# MongoDB client and database handle used by save_tomongo().
# MONGO_URL and MONGO_DB are not defined in this file; presumably they come
# from `from config import *` -- confirm against config.py.
client=pymongo.MongoClient(MONGO_URL)
db=client[MONGO_DB]

def search(max_retries=3):
    """Open the JD home page and submit a search for the keyword "美食".

    Waits (up to 10 seconds each) for the search box ``#key`` to be present
    and for the search button to be clickable, types the keyword and clicks
    the button.

    Args:
        max_retries: Maximum number of attempts before giving up. Values
            below 1 are treated as 1.

    Returns:
        None.

    Raises:
        selenium.common.exceptions.TimeoutException: If every attempt timed
            out while waiting for the page elements.
    """
    # WebDriverWait.until() raises Selenium's TimeoutException, not the
    # builtin TimeoutError, so the Selenium class must be caught for the
    # retry to ever trigger. Imported locally to keep this block
    # self-contained.
    from selenium.common.exceptions import TimeoutException

    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        try:
            browse.get("http://www.jd.com")
            search_box = WebDriverWait(browse, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#key"))
            )
            submit = WebDriverWait(browse, 10).until(
                EC.element_to_be_clickable(
                    (By.XPATH, '//*[@id="search"]/div/div[2]/button')
                )
            )
            search_box.send_keys("美食")
            submit.click()
            return None
        except (TimeoutException, TimeoutError):
            # Retry with a fresh page load; re-raise once the attempts are
            # exhausted instead of recursing without bound.
            if attempt == attempts:
                raise


def main():
    """Run the scraper: search, then alternate parsing and paging.

    Calls search() once, then for each page number 2..9 prints the number,
    parses the currently loaded result page and jumps to that page number.
    The browser session is always closed when the run ends, whether it
    finished normally or raised.

    NOTE(review): the page loaded by the final next_page() call is never
    parsed, because parse() runs before next_page() in each iteration --
    confirm whether that is intended.
    """
    try:
        search()
        for i in range(2, 10):
            print(i)
            parse()
            next_page(i)
    finally:
        # Release the Chrome window and its driver process; without this
        # they stay alive after the script exits.
        browse.quit()



def next_page(page_num, max_retries=3):
    """Jump to result page ``page_num`` using the "skip to page" box.

    Waits for the page-number input and the confirm link inside
    ``#J_bottomPage > span.p-skip``, scrolls the input into view, types the
    page number and clicks the confirm link.

    Args:
        page_num: Page number to type into the skip box.
        max_retries: Maximum number of attempts before giving up. Values
            below 1 are treated as 1.

    Returns:
        None.

    Raises:
        selenium.common.exceptions.WebDriverException: If every attempt
            failed (for example the pager never appeared).
    """
    # Imported locally to keep this block self-contained.
    from selenium.common.exceptions import WebDriverException

    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        try:
            time.sleep(3)

            page_input = WebDriverWait(browse, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > input")
                )
            )
            submit = WebDriverWait(browse, 10).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > a")
                )
            )

            # Some elements sit in a non-visible part of the page and must be
            # scrolled into view first. scrollIntoView(true) aligns the top of
            # the element with the top of the viewport; scrollIntoView(false)
            # aligns its bottom with the bottom of the viewport.
            # The element already located above is reused here, which avoids
            # the find_element_by_css_selector helper removed in Selenium 4.
            browse.execute_script(
                "arguments[0].scrollIntoView(false);", page_input
            )

            page_input.clear()
            page_input.send_keys(str(page_num))
            time.sleep(2)
            submit.click()
            return None
        except WebDriverException:
            # Retry a bounded number of times; the last failure is re-raised
            # instead of recursing until RecursionError.
            if attempt == attempts:
                raise


def parse():
    """Extract every product entry from the current page and store it.

    Parses the browser's current page source with lxml, selects each
    ``<li>`` under ``#J_goodsList`` and passes one dict per entry to
    save_tomongo(). Each dict value is the raw XPath result for that field
    (price, name, shop, link), not a single string.
    """
    document = etree.HTML(browse.page_source)
    for node in document.xpath('//*[@id="J_goodsList"]/ul/li'):
        record = {
            "价格": node.xpath("./div/div[3]/strong/i/text()"),
            "名称": node.xpath("./div/div[4]/a/em/text()[1]"),
            "商店": node.xpath("./div/div[7]/span/a/text()"),
            "链接": node.xpath("./div/div[7]/span/a/@href"),
        }
        save_tomongo(record)


def save_tomongo(result):
    """Insert one scraped record into the MongoDB collection MONGO_TABLE.

    Prints "保存成功" when the insert succeeds. On any error prints
    "保存失败" followed by the exception, and does not re-raise, so one bad
    record does not abort the crawl.

    Args:
        result: Dict describing one product entry.

    Returns:
        None.
    """
    try:
        # Collection.insert() was removed in PyMongo 4; insert_one() is the
        # supported call for a single document.
        db[MONGO_TABLE].insert_one(result)
    except Exception as exc:
        # Best-effort save: report the cause instead of hiding it.
        print("保存失败", exc)
    else:
        print("保存成功")



# Start the crawl only when this file is executed as a script.
if __name__ == '__main__':
    main()



  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值