Day03 爬取京东商品信息+元素交互操作+BeautifulSoup4

一、 先在京东搜索墨菲定律,然后对页面上的商品信息进行爬取:

 1 from selenium import webdriver
 2 import time
 3 from selenium.webdriver.common.keys import Keys  # 键盘按键操作
 4 
 5 driver = webdriver.Chrome()
 6 num = 1
 7 try:
 8     driver.implicitly_wait(10)
 9     # 往京东发送请求
10     driver.get('https://www.jd.com/')
11     #往京东主页输入框输入墨菲定律按回车键
12     input_tag = driver.find_element_by_id('key')
13     input_tag.send_keys('墨菲定律')
14     input_tag.send_keys(Keys.ENTER)
15 
16     time.sleep(5)
17 
18     good_list = driver.find_elements_by_class_name('gl-item')
19     for good in good_list:
20         # print(good)
21 
22         good_name = good.find_element_by_css_selector('.p-name em').text
23         # print(good_name)
24 
25         #商品链接
26         good_url = good.find_element_by_css_selector('.p-name a').get_attribute('href')
27         # print(good_url)
28 
29         #商品价格
30         good_price = good.find_element_by_class_name('p-price').text
31         # print(good_price)
32 
33         #商品评价
34         good_commit = good.find_element_by_class_name('p-commit').text
35         # print(good_commit)
36 
37         good_content = f'''
38         num:{num}
39         商品名称:{good_name}
40         商品连接:{good_url}
41         商品价格:{good_price}
42         商品评价:{good_commit}
43         \n
44         '''
45         print(good_content)
46         with open('jd.txt','a',encoding='utf-8') as f:
47             f.write(good_content)
48         num+=1
49 
50     #找到下一页并点击
51     next_tag = driver.find_element_by_class_name('pn-next')
52     next_tag.click()
53 
54     time.sleep(10)
55 
56 finally:
57     driver.close()

然后我们对上面代码进行升级,使其能够对页面下拉,下一页从而爬取更多的商品信息:

将爬取信息的步骤写成一个递归函数进行调用

 1 from selenium import webdriver
 2 import time
 3 from selenium.webdriver.common.keys import Keys  # 键盘按键操作
 4 
 5 driver = webdriver.Chrome()
 6 
 7 def get_good(driver):
 8     num = 1
 9     try:
10         time.sleep(5)
11 
12         # 下拉滑动5000px
13         js_code = '''
14                window.scrollTo(0,5000)
15            '''
16         driver.execute_script(js_code)
17         # 等待5秒等待商品加载
18         time.sleep(5)
19 
20         good_list = driver.find_elements_by_class_name('gl-item')
21         for good in good_list:
22 
23             #商品名称
24             good_name = good.find_element_by_css_selector('.p-name em').text
25 
26             # 商品链接
27             good_url = good.find_element_by_css_selector('.p-name a').get_attribute('href')
28 
29             # 商品价格
30             good_price = good.find_element_by_class_name('p-price').text
31 
32             # 商品评价
33             good_commit = good.find_element_by_class_name('p-commit').text
34 
35             good_content = f'''
36             num:{num}
37             商品名称:{good_name}
38             商品连接:{good_url}
39             商品价格:{good_price}
40             商品评价:{good_commit}
41             \n
42             '''
43             print(good_content)
44             with open('jd.txt', 'a', encoding='utf-8') as f:
45                 f.write(good_content)
46             num += 1
47 
48         # 找到下一页并点击
49         next_tag = driver.find_element_by_class_name('pn-next')
50         next_tag.click()
51 
52         time.sleep(5)
53         #递归调用函数本身
54         get_good(driver)
55 
56     finally:
57         driver.close()
58 
59 if __name__ == '__main__':
60     driver = webdriver.Chrome()
61 
62     try:
63         driver.implicitly_wait(10)
64         # 往京东发送请求
65         driver.get('https://www.jd.com/')
66         # 往京东主页输入框输入墨菲定律按回车键
67         input_tag = driver.find_element_by_id('key')
68         input_tag.send_keys('墨菲定律')
69         input_tag.send_keys(Keys.ENTER)
70 
71         #调用获取商品信息函数
72         get_good(driver)
73     finally:
74         driver.close()

二、元素交互操作

 1.在京东先搜索‘围城’,再清空输入栏,输入‘墨菲定律’进行搜索

 1 from selenium import webdriver
 2 from selenium.webdriver import ActionChains
 3 from selenium.webdriver.common.keys import Keys  # 键盘按键操作
 4 import time
 5 
 6 
 7 driver = webdriver.Chrome()
 8 
 9 try:
10     driver.implicitly_wait(10)
11     driver.get('https://www.jd.com/')
12     time.sleep(5)
13     #点击清除
14     input = driver.find_element_by_id('key')
15     input.send_keys('围城')
16 
17     #通过class查找搜索按钮
18     search = driver.find_element_by_class_name('button')
19     search.click()  #点击按钮搜索
20 
21     time.sleep(3)
22 
23     input2 = driver.find_element_by_id('key')
24     input2.clear()   #清空输入框
25 
26     time.sleep(1)
27 
28     input2.send_keys('墨菲定律')
29     input2.send_keys(Keys.ENTER)
30 
31     time.sleep(10)
32 
33 finally:
34     driver.close()
View Code

 2.将起始方块移动到目标方块中

 1 from selenium import webdriver
 2 from selenium.webdriver import ActionChains
 3 from selenium.webdriver.common.keys import Keys  # 键盘按键操作
 4 import time
 5 driver = webdriver.Chrome()
 6 
 7 try:
 8     driver.implicitly_wait(10)
 9     driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
10     time.sleep(5)
11 
12     #遗弃方法
13     # driver.switch_to_frame()
14     #新方法
15     driver.switch_to.frame('iframeResult')
16     time.sleep(1)
17 
18     #获取动作链对象
19     action = ActionChains(driver)
20     #启示方块id:draggable
21     source = driver.find_element_by_id('draggable')
22 
23     #目标方块id:droppable
24     target = driver.find_element_by_id('droggable')
25 
26     #方式一秒移
27     #起始方块瞬间移动到目标方块中
28     #拟定好一个动作,需要执行的方法perform
29     action.drag_and_drop(source,target).perform()
30 
31     time.sleep(10)
32 finally:
33     driver.close()
View Code

 3.将起始方块一步一步地移动到目标方块中

 1 from selenium import webdriver
 2 from selenium.webdriver import ActionChains
 3 from selenium.webdriver.common.keys import Keys  # 键盘按键操作
 4 import time
 5 
 6 driver = webdriver.Chrome()
 7 
 8 try:
 9     driver.implicitly_wait(10)
10     driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
11     time.sleep(5)
12 
13     #遗弃方法
14     # driver.switch_to_frame()
15     #新方法
16     driver.switch_to.frame('iframeResult')
17     time.sleep(1)
18 
19     #启示方块id:draggable
20     source = driver.find_element_by_id('draggable')
21 
22     #目标方块id:droppable
23     target = driver.find_element_by_id('droppable')
24 
25     print(source.size) #大小
26     print(source.text)  #文本
27     print(source.tag_name)  #标签名
28     print(source.location)  #坐标
29 
30     #找到滑动距离
31     distance = target.location['x']-source.location['x']
32 
33     #按住起始滑块
34     ActionChains(driver).click_and_hold(source).perform()
35     #方式二一点一点移
36     s=0
37     while s < distance:
38         #获取动作链对象
39         #每次移动距离
40         ActionChains(driver).move_by_offset(xoffset=2,yoffset=0).perform()
41         s+=2
42         time.sleep(0.1)
43 
44     #松开起始滑块
45     ActionChains(driver).release().perform()
46     time.sleep(10)
47 finally:
48     driver.close()
View Code

 4.目标网页执行js代码

 1 from selenium import webdriver
 2 from selenium.webdriver import ActionChains
 3 import time
 4 driver = webdriver.Chrome()
 5 try:
 6     driver.implicitly_wait(10)
 7 
 8     driver.get('https://www.baidu.com/')
 9     driver.execute_script(
10         'alert("你好呀!!!")'
11     )
12     time.sleep(10)
13 finally:
14     driver.close()
View Code

 5.模拟浏览器的前进后退

 1 from selenium import webdriver
 2 import time
 3 driver = webdriver.Chrome()
 4 try:
 5     driver.implicitly_wait(10)
 6 
 7     driver.get('https://www.baidu.com/')
 8     driver.get('https://www.taobao.com/')
 9     driver.get('https://www.sina.com.cn/')
10     #回退
11     driver.back()
12     time.sleep(5)
13     #前进
14     driver.forward()
15     time.sleep(3)
16 finally:
17     driver.close()
View Code

 

三、BeautifulSoup4

 1.bs4的安装与使用

 1 '''
 2 安装解析器:
 3     pip3 install lxml
 4 安装解析库:
 5     pip3 install bs4
 6 
 7 '''
 8 html_doc ="""
 9 <html><head><title>The Dormouse's story</title></head>
10 <body>
11 <p class="sister"><b>$37</b></p>
12 
13 <p class="story" id="p">Once upon a time there were three little sisters; and their names were
14 <a href="http://example.com/elsie" class="sister" >Elsie</a>,
15 <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
16 <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
17 and they lived at the bottom of a well.</p>
18 <p class="story">...</p>
19 """
20 
21 from bs4 import BeautifulSoup
22 
23 #python自带的解析库
24 # soup = BeautifulSoup(html_doc,'html.parser')
25 
26 #调用bs4得到一个soup对象
27 soup = BeautifulSoup(html_doc,'lxml')
28 
29 #打印bs4对象
30 # print(soup)
31 #打印bs4类型
32 # print(type(soup))
33 
34 #美化功能
35 html=soup.prettify()
36 print(html)
View Code

 2.bs4之遍历文档树

 1 '''
 2 安装解析器:
 3     pip3 install lxml
 4 安装解析库:
 5     pip3 install bs4
 6 
 7 '''
 8 html_doc ="""
 9 <html><head><title>The Dormouse's story</title></head>
10 <body>
11 <p class="sister"><b>$37</b></p>
12 
13 <p class="story" id="p">Once upon a time there were three little sisters; and their names were
14 <a href="http://example.com/elsie" class="sister" >Elsie</a>,
15 <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
16 <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
17 and they lived at the bottom of a well.</p>
18 <p class="story">...</p>
19 """
20 
21 from bs4 import BeautifulSoup
22 
23 soup = BeautifulSoup(html_doc,'lxml')
24 
25 #遍历文档树
26 # 1、直接使用 *****
27 print(soup.html)
28 print(type(soup.html))
29 print(soup.a)
30 print(soup.p)
31 
32 # 2、获取标签的名称
33 print(soup.a.name)
34 
35 # 3、获取标签的属性    *****
36 print(soup.a.attrs)   #获取a标签内的所有属性
37 print(soup.a.attrs['href'])
38 
39 # 4、获取标签文本的内容   *****
40 print(soup.p.text)  # $37
41 
42 # 5、嵌套选择
43 print(soup.html.body.p)
44 
45 # 6、子节点、子孙节点
46 print(soup.p.children)   #返回迭代器对象
47 print(list(soup.p.children))  #[<b>$37</b>]
48 
49 # 7、父节点、祖先节点
50 print(soup.b.parent)
51 print(soup.b.parents)
52 print(list(soup.b.parents))
53 
54 
55 # 8、兄弟节点  (sibling: 兄弟姐妹)
56 print(soup.a)
57 # 获取下一个兄弟节点
58 print(soup.a.next_sibling)
59 # 获取下一个的所有兄弟节点,返回的是一个生成器
60 print(soup.a.next_siblings)
61 print(list(soup.a.next_siblings))
62 #
63 # 获取上一个兄弟节点
64 print(soup.a.previous_sibling)
65 # 获取上一个的所有兄弟节点,返回的是一个生成器
66 print(list(soup.a.previous_siblings))
View Code

 3.bs4之搜索文档树

  1 '''
  2 find:找第一个
  3 find_all:找所有
  4 标签查找与属性查找:
  5     name属性
  6             name 标签名
  7             attrs 属性查找匹配
  8             text 文本匹配
  9 
 10     标签:
 11         - 字符串过滤器   字符串全局匹配
 12 
 13         - 正则过滤器
 14             re模块匹配
 15 
 16         - 列表过滤器
 17             列表内的数据匹配
 18 
 19         - bool过滤器
 20             True匹配
 21 
 22         - 方法过滤器
 23             用于一些要的属性以及不需要的属性查找。
 24 
 25     属性:
 26         - class_
 27         - id
 28 '''
 29 
 30 
 31 html_doc = """
 32 <html><head><title>The Dormouse's story</title></head><body><p class="sister"><b>$37</b></p><p class="story" id="p">Once upon a time there were three little sisters; and their names were<a href="http://example.com/elsie" class="sister" >Elsie</a><a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>and they lived at the bottom of a well.</p><p class="story">...</p>
 33 """
 34 
 35 from bs4 import  BeautifulSoup
 36 
 37 soup = BeautifulSoup(html_doc,'lxml')
 38 
 39 #name 标签名
 40 # attrs 属性查找匹配
 41 # text 文本匹配
 42 #find与find_all搜索文档
 43 
 44 '''
 45 字符串过滤器
 46 '''
 47 p = soup.find(name='p')
 48 p_s = soup.find_all(name='p')
 49 print(p)
 50 print(p_s)
 51 
 52 #name+attrs
 53 p = soup.find(name='p',attrs={"id":"p"})
 54 print(p)
 55 
 56 #name+text
 57 p = soup.find(name='title',text="The Dormouse's story")
 58 print(p)
 59 
 60 #name+attrs+text
 61 tag = soup.find(name='a',attrs={"class":"sister"},text='Elsie')
 62 print(tag)
 63 
 64 
 65 '''
 66 -正则过滤器
 67 re模块匹配
 68 '''
 69 import re
 70 #name
 71 #根据re模块匹配带有a的节点
 72 a = soup.find(name=re.compile('a'))
 73 a_s = soup.find_all(name=re.compile('a'))
 74 print(a)
 75 print(a_s)
 76 
 77 #attrs
 78 a = soup.find(attrs={"id":re.compile('link')})
 79 print(a)
 80 
 81 
 82 #列表过滤器
 83 #列表内数据匹配
 84 print(soup.find(name=['a','p','html',re.compile('a')]))
 85 print(soup.find_all(name=['a','p','html',re.compile('a')]))
 86 
 87 # bool过滤器
 88 #True匹配
 89 print(soup.find(name=True,attrs={"id":True}))
 90 
 91 #方法过滤器
 92 #用于一些要的属性以及不需要的属性查找
 93 def have_id_not_class(tag):
 94     if tag.name == 'p' and tag.has_attr("id") and not tag.has_attr("class"):
 95         return tag
 96 print(soup.find_all(name=函数对象)
 97 print(soup.find_all(name=have_id_not_class))
 98 
 99 #补充说明:
100 #id
101 a = soup.find(id="link2")
102 print(a)
103 
104 #class
105 p = soup.find(class_='sister')
106 print(p)
View Code

 

转载于:https://www.cnblogs.com/tanknb/p/11129164.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值