from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
from selenium.webdriver.common.keys import Keys
import time
from lxml import etree
from config import *
import pymongo
# Shared Chrome WebDriver session used by the scraping functions in this file.
# NOTE: it is started as an import-time side effect of this module.
browse=webdriver.Chrome()
browse.maximize_window()
# MongoDB client and target database handle.
# MONGO_URL / MONGO_DB are presumably provided by `from config import *` -- verify.
client=pymongo.MongoClient(MONGO_URL)
db=client[MONGO_DB]
def search(retries=3):
    """Open the JD home page and submit a search for the hard-coded keyword.

    The page is reloaded and the search retried when one of the explicit
    waits times out.

    Args:
        retries: Maximum number of attempts (at least one is always made).

    Returns:
        None.

    Raises:
        selenium.common.exceptions.TimeoutException: if the search box or the
            submit button is still not ready on the final attempt.
    """
    # WebDriverWait.until() raises Selenium's TimeoutException, not the builtin
    # TimeoutError, so the builtin would never match and no retry would happen.
    from selenium.common.exceptions import TimeoutException

    attempts = max(1, retries)
    for attempt in range(1, attempts + 1):
        try:
            browse.get("http://www.jd.com")
            search_box = WebDriverWait(browse, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#key"))
            )
            submit = WebDriverWait(browse, 10).until(
                EC.element_to_be_clickable(
                    (By.XPATH, '//*[@id="search"]/div/div[2]/button')
                )
            )
            search_box.send_keys("美食")
            submit.click()
            return None
        except TimeoutException:
            # Bounded retry: give up (and surface the error) on the last attempt
            # instead of recursing without limit.
            if attempt == attempts:
                raise
    return None
def main():
    """Run the crawl: search, then parse result pages 1 through 9.

    The first result page is parsed right after the search; every later page
    is parsed after navigating to it, so the last page that gets loaded is
    also stored. The browser session is closed when the crawl ends, whether
    it finished normally or failed.
    """
    try:
        search()
        parse()  # first result page
        for i in range(2, 10):
            print(i)
            next_page(i)
            # NOTE(review): parse() runs immediately after the pager click,
            # without an explicit wait for the new result list -- confirm the
            # page has refreshed by then.
            parse()
    finally:
        # Release the Chrome process even if a step above raised.
        browse.quit()
def next_page(page_num, retries=3):
    """Jump the result list to page ``page_num`` using the pager's page-number box.

    Args:
        page_num: Page number to type into the pager input.
        retries: Maximum number of attempts (at least one is always made).

    Returns:
        None.

    Raises:
        selenium.common.exceptions.WebDriverException: if the pager cannot be
            located or operated on the final attempt.
    """
    from selenium.common.exceptions import WebDriverException

    attempts = max(1, retries)
    for attempt in range(1, attempts + 1):
        try:
            time.sleep(3)
            page_box = WebDriverWait(browse, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > input")
                )
            )
            submit = WebDriverWait(browse, 10).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > a")
                )
            )
            # Elements outside the visible area must be scrolled into view.
            # scrollIntoView(true) aligns the element's top with the top of
            # the viewport; scrollIntoView(false) aligns its bottom with the
            # bottom of the viewport.
            # The already-located input is reused here; the former second
            # lookup used find_element_by_css_selector, which no longer
            # exists in Selenium 4.3+.
            browse.execute_script("arguments[0].scrollIntoView(false);", page_box)
            page_box.clear()
            page_box.send_keys(str(page_num))
            time.sleep(2)
            submit.click()
            return None
        except WebDriverException:
            # Only browser-side failures are retried, and only a bounded
            # number of times; programming errors propagate immediately
            # instead of recursing until RecursionError.
            if attempt == attempts:
                raise
    return None
def parse():
    """Scrape the product entries of the currently loaded page and store them.

    Each ``<li>`` under ``#J_goodsList`` becomes one record whose values are
    the raw XPath results for price, name, shop and link; every record is
    passed to ``save_tomongo``.
    """
    # Record key -> XPath relative to one product <li>, in insertion order.
    field_paths = (
        ("价格", "./div/div[3]/strong/i/text()"),
        ("名称", "./div/div[4]/a/em/text()[1]"),
        ("商店", "./div/div[7]/span/a/text()"),
        ("链接", "./div/div[7]/span/a/@href"),
    )
    document = etree.HTML(browse.page_source)
    for product in document.xpath('//*[@id="J_goodsList"]/ul/li'):
        record = {key: product.xpath(path) for key, path in field_paths}
        save_tomongo(record)
def save_tomongo(result):
    """Insert one scraped record into the configured MongoDB collection.

    Saving is best-effort: a failure is reported on stdout together with its
    cause and does not interrupt the crawl.

    Args:
        result: Dict describing one product; the driver adds an ``_id`` key
            to it on insert.
    """
    try:
        # Collection.insert() was removed in PyMongo 4; insert_one() is the
        # supported single-document API (available since PyMongo 3.0).
        outcome = db[MONGO_TABLE].insert_one(result)
    except Exception as exc:
        # Keep the crawl going, but show why the save failed.
        print("保存失败", exc)
    else:
        if outcome.inserted_id is not None:
            print("保存成功")
# Start the crawl only when this file is executed as a script.
if __name__ == '__main__':
    main()
# python+selenium 爬取数据
# 最新推荐文章于 2024-05-22 11:59:29 发布