from selenium import webdriver
#模拟鼠标操作
from selenium.webdriver import ActionChains
#键盘按键操作
from selenium.webdriver.common.keys import Keys
import time
from bs4 import BeautifulSoup
# --- Scrape JD search results with selenium + BeautifulSoup ---
# Start a Chrome browser controlled by selenium.
driver = webdriver.Chrome()
# Open the JD home page first.
driver.get('https://www.jd.com/')
# Product keyword typed by the user.
keyword = input('请输入你要查询的产品:')
# Search-result URL: the query string must carry the 'keyword=' parameter
# (the original concatenated the bare keyword, producing an invalid URL).
url = 'https://search.jd.com/Search?keyword=' + keyword
# Locate the search box and simulate typing the keyword + pressing Enter.
search = driver.find_element_by_xpath('//*[@id="key"]')
search.send_keys(keyword)
search.send_keys(Keys.ENTER)
# Navigate directly to the result page as well (same destination as the
# simulated search above; kept to demonstrate both approaches).
driver.get(url)
# Give the JS-rendered page time to load BEFORE grabbing the source;
# the original parsed first and slept afterwards, which was too late.
time.sleep(2)
# Parse the rendered HTML.
soup = BeautifulSoup(driver.page_source, 'lxml')
# Each product sits in an <li class="gl-item">.
data = soup.find_all('li', class_='gl-item')
for data_s in data:
    # Product name (guard: find() returns None when the tag is absent,
    # and .get_text() on None would crash the whole loop).
    name_tag = data_s.find('div', class_='p-name')
    if name_tag is not None:
        print(name_tag.get_text().strip())
    # Shop name; not every item carries this span, so guard it too.
    shop_tag = data_s.find('span', class_='J_im_icon')
    # Price: strip the currency sign and thousands separators.
    price_tag = data_s.find('div', class_='p-price')
    if price_tag is not None:
        price = price_tag.text.replace('¥', '').replace(',', '')
        print(price.strip())
# Close the browser so no Chrome process is leaked.
driver.quit()
打开selenium浏览器驱动进行抓取
1.实例化一个谷歌浏览器驱动
2.打开网址
3.输入关键字
4.分析网页url地址并与关键词进行拼接
5.使用selenium进行搜索框定位
6.使用selenium库模拟用户键盘鼠标操作
7.解析网页:使用selenium配合bs4进行解析
8.分析网页标签
9.进行数据提取
10.保存数据
运行结果
#函数代码
import selenium
import csv
import time
from bs4 import BeautifulSoup
#1.获取url
def fetch(url):
    """Open *url* in a selenium-driven Chrome browser, wait for the
    JS-rendered content, then hand the parsed page to parse().

    The browser is always closed, even if parsing raises — the original
    leaked one Chrome process per call.
    """
    # The file only does `import selenium`, which does not bind the
    # `webdriver` name; import it explicitly here.
    from selenium import webdriver
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # JD renders the product list asynchronously; give it a moment
        # before snapshotting page_source.
        time.sleep(2)
        soup = BeautifulSoup(driver.page_source, 'lxml')
    finally:
        driver.quit()
    parse(soup)
#解析
def parse(soup):
    """Extract product-name and price nodes from a result page and
    forward them to output().

    The original collected 'J_im_icon' spans — the SHOP name — under a
    misspelled variable (`prcies`) and passed them to output() as prices;
    select the actual price container instead (as the later version of
    this code does).
    """
    names = soup.find_all('div', {'class': 'p-name'})
    prices = soup.find_all('div', {'class': 'p-price'})
    output(names, prices)
#列表形式输出
def output(names, prices):
    """Print each (name, price) pair and append it to jingdong.csv.

    Uses a context manager so the file handle is always closed (the
    original never closed it), and opens with newline='' as the csv
    module requires to avoid blank rows on Windows.
    """
    with open('jingdong.csv', 'a', encoding='utf-8', newline='') as out:
        csv_write = csv.writer(out, dialect='excel')
        for name, price in zip(names, prices):
            # Strip once and reuse — the original called get_text().strip()
            # twice per field.
            name_text = name.get_text().strip()
            price_text = price.get_text().strip()
            print("商品名称:" + name_text, u"\n商品价格: " + price_text)
            csv_write.writerow([name_text, price_text])
    print('------' * 10)
if __name__ == '__main__':
    # Entry point: ask for a keyword, then crawl three result pages.
    keyword = input('请输入需要查找的商品:')
    page_index = 0
    while page_index < 3:
        # Requests pages 1, 3, 5 — exactly what the index formula
        # i * 2 + 1 produced in the original loop.
        target = ('https://search.jd.com/Search?keyword=' + keyword
                  + '&enc=utf-8&page=' + str(page_index * 2 + 1))
        # Brief pause between requests.
        time.sleep(1)
        fetch(target)
        page_index += 1
    print('加载完成。')
和上面代码一样,只是用函数封装了一下。
1.解析
import selenium
import csv
import time
from bs4 import BeautifulSoup
#1.获取url
def fetch(url):
    """Open *url* in a selenium-driven Chrome browser, wait for the
    JS-rendered content, then hand the parsed page to parse().

    The browser is always closed, even if parsing raises — the original
    leaked one Chrome process per call.
    """
    # The file only does `import selenium`, which does not bind the
    # `webdriver` name; import it explicitly here.
    from selenium import webdriver
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # JD renders the product list asynchronously; give it a moment
        # before snapshotting page_source.
        time.sleep(2)
        soup = BeautifulSoup(driver.page_source, 'lxml')
    finally:
        driver.quit()
    parse(soup)
# 2.解析
def parse(soup):
    """Extract product-name and price nodes from a result page and
    forward them to output().

    Renamed from the misspelled `prase`: fetch() calls parse(), so the
    misspelling raised NameError at runtime. The old name is kept below
    as an alias for backward compatibility.
    """
    names = soup.find_all('div', {'class': 'p-name p-name-type-2'})
    prices = soup.find_all('div', {'class': 'p-price'})
    output(names, prices)

# Backward-compatible alias for the original (misspelled) public name.
prase = parse
#3.列表形式输出
#列表形式输出
def output(names, prices):
    """Print each (name, price) pair and append it to jingdong.csv.

    Uses a context manager so the file handle is always closed (the
    original never closed it), and opens with newline='' as the csv
    module requires to avoid blank rows on Windows.
    """
    with open('jingdong.csv', 'a', encoding='utf-8', newline='') as out:
        csv_write = csv.writer(out, dialect='excel')
        for name, price in zip(names, prices):
            # Strip once and reuse — the original called get_text().strip()
            # twice per field.
            name_text = name.get_text().strip()
            price_text = price.get_text().strip()
            print("商品名称:" + name_text, u"\n商品价格: " + price_text)
            csv_write.writerow([name_text, price_text])
    print('------' * 10)
# 4.程序调用入口
if __name__ == '__main__':
    # Entry point: read the keyword, then crawl result pages 1, 3 and 5
    # (the same pages the original index loop 0..2 produced via i*2+1).
    keyword = input('请输入需要查找的商品:')
    for page in (1, 3, 5):
        search_url = ('https://search.jd.com/Search?keyword=' + keyword
                      + '&enc=utf-8&page=' + str(page))
        # Brief pause between requests.
        time.sleep(1)
        fetch(search_url)
    print('加载完成。')
总结
京东是动态页面,内容通过ajax请求异步渲染加载,普通的请求解析不了网页、获取不了信息。
所以我们这里用到selenium爬取
针对动态网页加载的框架
直接解析就好了,很方便;缺点就是有点慢,
毕竟是模拟浏览器进行操作的。
下一篇介绍用fiddler进行抓包分析提取ajax里面的参数进行爬取