from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time

# Scrape JD search results (keyword: 电饭锅 / rice cooker) page by page,
# printing name, url, price, comment count and shop info for every item.

# 1. Create the browser
# 1) Build the options object
options = ChromeOptions()
# 2) Disable image loading to speed up page loads
options.add_argument('blink-settings=imagesEnabled=false')
# Hide the "Chrome is being controlled by automated test software" infobar
options.add_experimental_option('excludeSwitches', ['enable-automation'])
# 3) Create the browser from the options
b = Chrome(options=options)

# 2. Open the search-result page
b.get('https://search.jd.com/Search?keyword=%E7%94%B5%E9%A5%AD%E9%94%85&enc=utf-8&wq=%E7%94%B5%E9%A5%AD%E9%94%85&pvid=388bf700598a4288b08f0bd375c4f93e')
time.sleep(1)

for page in range(10):
    # 3. Scroll the page in steps so lazily loaded items are rendered
    for _ in range(10):
        b.execute_script('window.scrollBy(0, 800)')
        time.sleep(1)

    # 4. Parse the rendered page
    soup = BeautifulSoup(b.page_source, 'lxml')
    goods_div = soup.select('#J_goodsList .gl-i-wrap')
    for div in goods_div:
        # NOTE(review): select_one returns None when a sub-element is missing
        # (e.g. an item without a shop link would crash here) — confirm the
        # selectors always match before relying on this in production.
        name_a = div.select_one('.p-name a')
        name = name_a.text
        goods_url = 'https:' + name_a.attrs['href']
        price = div.select_one('.p-price i').text.strip()
        commit = div.select_one('.p-commit a').text.strip()
        shop_a = div.select_one('.p-shop a')
        shop_name = shop_a.text
        shop_url = 'https:' + shop_a.attrs['href']
        print(name, goods_url, price, commit, shop_name, shop_url)
    print('---------------------1页数据解析完成----------------------')

    # 5. Go to the next page (renamed from `next`, which shadowed the builtin)
    next_btn = b.find_element(By.CLASS_NAME, 'pn-next')
    next_btn.click()
    time.sleep(3)

input('end:')
# ===== 02 jingdongDetails =====
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By
import time
from bs4 import BeautifulSoup

# Open each product's detail page from a JD search-result list, click the
# reviews tab, and dump the detail page's HTML.

options = ChromeOptions()
# Skip image loading to speed things up
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
# Hide the automation infobar
options.add_experimental_option('excludeSwitches', ['enable-automation'])
b = Chrome(options=options)
b.get('https://search.jd.com/Search?keyword=%E7%94%B5%E9%A5%AD%E9%94%85&enc=utf-8&wq=%E7%94%B5%E9%A5%AD%E9%94%85&pvid=388bf700598a4288b08f0bd375c4f93e')

# Collect the anchors that open each product's detail page
all_goods_a = b.find_elements(By.CSS_SELECTOR, '#J_goodsList .gl-i-wrap>.p-img>a')
for a in all_goods_a:
    a.click()
    time.sleep(1)
    # The click opens a new tab; switch to it
    b.switch_to.window(b.window_handles[-1])
    time.sleep(2)
    # Scroll the detail page, then click the reviews tab ('商品评价')
    b.execute_script('window.scrollBy(0, 900)')
    time.sleep(1)
    comment = b.find_elements(By.CSS_SELECTOR, '#detail>.tab-main>ul>li')[-2]
    comment.click()
    time.sleep(2)
    print(b.page_source)
    print('================================获取完一个商品===============================')
    # Close the detail tab and return to the search-result tab
    b.close()
    b.switch_to.window(b.window_handles[0])

input('end:')
# ===== 03 requestsLogin =====
import requests

# Automatic login with requests:
# Step 1: log in to the target site manually in a browser.
# Step 2: copy the site's post-login cookie value.
# Step 3: attach that cookie to the request headers when sending the request.
headers = {
    'cookie': '...',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}
response = requests.get('https://www.zhihu.com/', headers=headers)
print(response.text)
# ===== 04 seleniumGetCookie =====
from selenium.webdriver import Chrome

# 1. Open the page that requires login
b = Chrome()
b.get('https://www.taobao.com')
# 2. Block until a human has finished logging in. The window `b` points at
#    must show the logged-in state before continuing.
input('是否已经完成登录:')
# 3. Save the post-login cookies to a local file.
#    The file holds the repr() of a list of cookie dicts; the companion
#    script reads it back with eval(). JSON would be a safer format.
result = b.get_cookies()
with open('files/taobao.txt', 'w', encoding='utf-8') as f:
    f.write(str(result))
# ===== 05 seleniumUseCookie =====
from selenium.webdriver import Chrome

# 1. Open the page that requires login
b = Chrome()
b.get('https://www.taobao.com')
# 2. Load the locally saved cookies.
#    NOTE(review): eval() on file contents is dangerous on untrusted input;
#    it is tolerated here only because the companion script wrote this file
#    itself as a repr() of the cookie list. Prefer json for new code.
with open('files/taobao.txt', encoding='utf-8') as f:
    result = eval(f.read())
# 3. Install the cookies into the browser session
for x in result:
    b.add_cookie(x)
# 4. Reload the page — it should now show the logged-in state
b.get('https://www.taobao.com')
input('end:')
# ===== 06 requestsProxy =====
import requests

headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
# Configure the proxy
proxies = {'https': '116.140.52.224:4513'}
# Send the request through the proxy IP
res = requests.get('https://movie.douban.com/top250?start=0&filter=', headers=headers, proxies=proxies)
# FIX: the original `print(res.text` was missing its closing parenthesis
print(res.text)
# ===== 07 seleniumProxy =====
from selenium.webdriver import Chrome, ChromeOptions

options = ChromeOptions()
# Route all browser traffic through a proxy server
options.add_argument('--proxy-server=http://180.127.245.154:4515')
b = Chrome(options=options)
b.get('https://movie.douban.com/top250?start=0&filter=')
input()
# ===== 08 xpath =====
import json

# XPath is a way to parse HTML or XML data: it addresses tags (elements)
# by their path in the document tree.
"""
Python data: {'name': 'xiaoming', 'age': 18, 'is_ad': True, 'car_no': None}
Json data:   {"name": "xiaoming", "age": 18, "is_ad": true, "car_no": null}
xml data:
<allStudent>
    <student class="优秀学员">
        <name>xiaoming</name>
        <age>18</age>
        <is_ad>是</is_ad>
        <car_no></car_no>
    </student>
    <student class="优秀学员">
        <name>xiaoming</name>
        <age>18</age>
        <is_ad>是</is_ad>
        <car_no></car_no>
    </student>
</allStudent>
"""

# 1. Common concepts
"""
1) Tree: the whole html/xml document is a tree structure
2) Element (node): each tag in the html tree
3) Root node: the first node of the tree
4) Content: the text inside a tag
5) Attribute: an attribute on a tag
"""

# 2. XPath syntax
"""
1. Selecting tags
   1) absolute path: starts with '/', written level by level from the root
   2) relative path: starts with '.' (current node) or '..' (parent node);
      a leading './' may be omitted
   3) full path: starts with '//'
2. Tag content: append '/text()' to the path
3. Tag attribute: append '/@attr_name' to the path
"""

# ============================== usage ===================================
from lxml import etree

# 1. Build the tree and get the root node (use `with` so the file is closed)
with open('data.html', encoding='utf-8') as f:
    html = f.read()
root = etree.HTML(html)

# 2. Select tags by path
# node.xpath(path) -> list of matching node objects
# 1) absolute path
result = root.xpath('/html/body/div/a')
print(result)
# tag content
result = root.xpath('/html/body/div/a/text()')
print(result)
# tag attribute
result = root.xpath('/html/body/div/a/@href')
print(result)
# an absolute path gives the same result whichever node .xpath is called on
div = root.xpath('/html/body/div')[0]
result = div.xpath('/html/body/div/a/text()')
print(result)
print('--------------------------------华丽的分割线-------------------------------------')

# 2) relative path
result = root.xpath('./body/div/a/text()')
print(result)
result = div.xpath('./a/text()')
print(result)
result = div.xpath('a/text()')
print(result)

# 3) full path
result = root.xpath('//a/text()')
print(result)
result = div.xpath('//a/text()')
print(result)
result = root.xpath('//div/a/text()')
print(result)
print('--------------------------------华丽的分割线-------------------------------------')

# 3. Predicates (conditions) — written as [] after a node in the path
# 1) position predicates
"""
[N] - the N-th matching tag (N starts at 1)
[last()] - the last matching tag
[last()-N]
[position()>N], [position()>=N], [position()<N], [position()<=N]
"""
result = root.xpath('//span/p[2]/text()')
print(result)
result = root.xpath('//span/p[last()]/text()')
print(result)
result = root.xpath('//span/p[position()<=2]/text()')
print(result)
result = root.xpath('//span/p[position()>2]/text()')
print(result)
result = root.xpath('//span/p[last()-1]/text()')
print(result)
print('--------------------------------华丽的分割线-------------------------------------')

# 2) attribute predicates
"""
[@attr_name=value]
"""
result = root.xpath('//span/p[@id="p1"]/text()')
print(result)
result = root.xpath('//span/p[@class="c1"]/text()')
print(result)
result = root.xpath('//span/p[@data="5"]/text()')
print(result)

# 4. Wildcards — '*' matches any tag or any attribute
result = root.xpath('//span/*/text()')
print(result)
result = root.xpath('//span/p[@class="c1"]/text()')
print(result)
result = root.xpath('//span/*[@class="c1"]/text()')
print(result)
result = root.xpath('//span/span/@*')
print(result)
result = root.xpath('//*[@class="c1"]/text()')
print(result)
# ===== 09 xpathDouBan =====
import requests
from lxml import etree

# Fetch one page of douban's Top 250 movies through a proxy and extract
# titles, ratings, comment counts and one-line quotes with XPath.

# 1. Fetch the page
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
proxies = {'https': '117.70.49.86:4531'}
response = requests.get('https://movie.douban.com/top250?start=125&filter=', headers=headers, proxies=proxies)

# 2. Parse it
root = etree.HTML(response.text)
names = root.xpath('//div[@class="hd"]/a/span[1]/text()')
scores = root.xpath('//span[@class="rating_num"]/text()')
comments = root.xpath('//div[@class="star"]/span[last()]/text()')
msgs = root.xpath('//p[@class="quote"]/span/text()')
print(names)
print(scores)
print(comments)
print(msgs)