用 request 下载京东搜索商品页面源码后,发现得到的数据只有30条,怀疑京东搜索页面加载方式应该是动态渲染的,所以打算采用 Selenium 驱动谷歌浏览器来爬取搜索页面。
代码如下:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import etree
import time
import csv
class JDSelenium:
    """Crawl JD (jd.com) search-result pages with Selenium-driven headless Chrome.

    JD renders search results dynamically (a plain HTTP request only returns
    the first 30 items), so a real browser is driven, scrolled to the bottom
    to trigger lazy loading, and the fully rendered HTML is parsed with lxml.

    Only products from self-operated ("自营") stores are kept; rows are
    appended to '<keyword>.csv'.
    """

    def __init__(self, keyword, page, timeout=10, service_args=None):
        # service_args defaults to None instead of a shared mutable list
        # (mutable-default-argument pitfall); behavior is unchanged.
        if service_args is None:
            service_args = ['--load-images=false', '--disk-cache=true']
        self.keyword = keyword
        self.timeout = timeout
        self.page = page          # number of logical result pages to crawl
        self.chrome_options = webdriver.ChromeOptions()
        self.chrome_options.add_argument('--headless')
        # NOTE(review): `chrome_options=` / `service_args=` are Selenium 3-era
        # keywords (Selenium 4 uses `options=` and a Service object) — confirm
        # the installed Selenium version before upgrading.
        self.browser = webdriver.Chrome(chrome_options=self.chrome_options,
                                        service_args=service_args)
        self.browser.set_page_load_timeout(self.timeout)
        # Uncomment to watch a visible (non-headless) browser instead:
        # self.browser = webdriver.Chrome()
        self.wait = WebDriverWait(self.browser, self.timeout)
        self.url = r'https://search.jd.com/Search?keyword={keyword}&enc=utf-8&page={page}'
        # Explicit encoding so non-ASCII keywords/titles do not crash on
        # platforms whose default encoding is not UTF-8 (e.g. Windows/GBK).
        self.file = open('{keyword}.csv'.format(keyword=self.keyword),
                         'w', newline='', encoding='utf-8')
        self.write = self.create_writer()
        self.count = 0            # rows written so far

    def close(self):
        """Shut down the browser and close the CSV file."""
        # quit() (not close()) also terminates the chromedriver process;
        # close() only closes the window and leaks the driver process.
        self.browser.quit()
        self.file.close()

    def create_writer(self):
        """Create a csv.DictWriter on self.file and emit the header row."""
        fieldnames = ['Title', 'Store', 'Price', 'Comments']
        writer = csv.DictWriter(self.file, fieldnames=fieldnames)
        writer.writeheader()
        return writer

    def process_request(self, page):
        """Load one search-result page and return its rendered DOM.

        Waits for the "next page" button to appear, scrolls to the bottom,
        then sleeps so lazily loaded items finish rendering.

        Returns an lxml element tree, or None when the request failed
        (in which case the browser/file have already been closed).
        """
        try:
            self.browser.get(self.url.format(keyword=self.keyword, page=page))
            self.wait.until(
                EC.presence_of_element_located((By.CLASS_NAME, "pn-next")))
            self.browser.execute_script(
                'window.scrollTo(0,document.body.scrollHeight)')
            time.sleep(5)  # give lazily loaded items time to render
        except Exception as e:
            # On failure: log and clean up. To retry instead, replace
            # self.close() with: return self.process_request(page)
            print(e)
            self.close()
            return None
        return etree.HTML(self.browser.page_source)

    def process_item(self, response):
        """Extract self-operated ("自营") products from a rendered page and
        append them to the CSV. A None response (failed request) is ignored."""
        if response is None:
            # process_request failed and already cleaned up — nothing to do.
            return
        products = response.xpath('//*[@id="J_goodsList"]/ul//li[@class="gl-item"]')
        for product in products:
            # Keep only self-operated stores.
            if '自营' not in product.xpath('.//div[@class="p-icons"]//i//text()'):
                continue
            item = {}
            # join() tolerates self-operated items that omit the store name
            # (empty xpath result -> empty string instead of an IndexError).
            item['Store'] = ''.join(
                product.xpath('.//div[@class="p-shop"]/span/a/text()'))
            item['Title'] = ''.join(
                product.xpath('.//div[contains(@class,"p-name")]/a/em//text()'))
            item['Price'] = product.xpath(
                './/div[@class="p-price"]/strong/i/text()')[0]
            item['Comments'] = product.xpath(
                './/div[@class="p-commit"]/strong/a/text()')[0]
            self.count += 1
            self.write.writerow(item)

    def run(self):
        """Crawl all requested pages, then close resources and report totals.

        JD numbers physical pages 1, 3, 5, ... so logical page p maps to
        request parameter 2*p-1.
        """
        for page in range(1, self.page + 1):
            print('------正在爬取第{page}页------'.format(page=page))
            response = self.process_request(2 * page - 1)
            self.process_item(response)
        print('数据保存完成')
        self.close()
        print('共爬取到{count}条数据'.format(count=self.count))
if __name__ == '__main__':
    # Crawl a single search-result page for the keyword "电脑".
    spider = JDSelenium(keyword='电脑', page=1)
    spider.run()
这里没有通过 selenium 获取需要的信息,而是等页面加载完成后通过网页源代码抽取需要的信息。其实本来是想写成 Scrapy+Selenium 的,但嫌弃 scrapy 框架太麻烦了,所以整合到一块了。
ps:由于请求链接后,需要等页面加载一部分然后滚动页面,最后再等待加载完成,耗费一点时间,所以爬取过程有点慢。