import time
from queue import Queue

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
class JdScrapyed():
    """Simple breadth-first crawler that prints JD.com search results.

    Searches jd.com for a keyword, then walks result pages one by one,
    printing each item's name and price until there is no "next page" link.
    """

    def __init__(self, scrapy_name):
        """Set up the crawl queue and the browser session.

        :param scrapy_name: search keyword whose results will be crawled
        """
        self.scrapy_name = scrapy_name
        # BFS-style work queue: one token per page still to be crawled.
        self.crawl_queue = Queue()
        self.crawl_queue.put('pesudo://next')
        self.browser = webdriver.Firefox()
        self.browser.get('https://www.jd.com/')

    def run(self):
        """Search for the keyword and print name/price of every result.

        Follows the "next page" link until it no longer exists, then
        quits the browser.
        """
        # Fill the search box and submit the query.
        # NOTE: find_element(By, ...) replaces the find_element_by_* helpers
        # that were removed in Selenium 4.
        input_element = self.browser.find_element(By.CSS_SELECTOR, 'input#key')
        input_element.send_keys(self.scrapy_name)
        button_element = self.browser.find_element(By.CSS_SELECTOR, 'button.button')
        button_element.click()
        # Crude fixed sleep to let results render; an explicit
        # WebDriverWait would be more robust.
        time.sleep(2)
        try:
            while not self.crawl_queue.empty():
                # Consume the token for the page we are about to scrape.
                # Without this get() the queue could never drain, so the
                # loop condition could never terminate the crawl.
                self.crawl_queue.get()
                targets = self.browser.find_elements(By.CLASS_NAME, 'gl-item')
                time.sleep(1)
                for target in targets:
                    target_name = target.find_element(By.CSS_SELECTOR, 'div.p-name em').text
                    target_price = target.find_element(By.CSS_SELECTOR, 'div.p-price i').text
                    print(target_name)
                    print(target_price)
                # Scroll to the bottom so the "next page" link is rendered
                # and clickable.
                self.browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
                try:
                    next_page = self.browser.find_element(By.CSS_SELECTOR, 'a.pn-next')
                except NoSuchElementException:
                    # Last results page: no "next" link, stop crawling.
                    break
                next_page.click()
                # Push a token so the loop processes the page just opened.
                self.crawl_queue.put('pesudo://next ')
                print('正在抓取下一页')
                print('*' * 100)
        finally:
            # Always release the browser process, even on errors.
            self.browser.quit()
if __name__ == '__main__':
    # Guard the entry point so importing this module does not launch a
    # browser as a side effect. Crawl results for the keyword '手机'.
    jd_phone = JdScrapyed('手机')
    jd_phone.run()
# NOTE: reusable for any keyword by changing the constructor argument;
# could also be extended into a multi-threaded crawler (e.g. threading),
# though that is rarely necessary for this scale.