1.豆瓣电影
'''
需求:获取所有分类下,所有电影信息(排名,电影名,演员,评分)
'''
import re
import requests
from lxml import etree
# Request helper
def get_response(url):
    """Send a GET request to ``url`` and return the decoded response.

    Relies on two module-level names: ``headers`` (sent with the request)
    and ``flag`` (selects the return form).

    Args:
        url: Address to request.

    Returns:
        The response body as text when ``flag`` is 0, otherwise the body
        parsed as JSON.

    Raises:
        requests.exceptions.Timeout: if the server does not answer within
            the timeout below.
    """
    # Without a timeout requests waits indefinitely on a stalled connection,
    # which would freeze the whole crawl.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Echo the raw body (kept from the original as a debugging aid).
    print(response.text)
    if flag == 0:
        return response.text
    return response.json()
# Category ("type") discovery
def get_type(url):
    """Walk the category links on the page at ``url``.

    For every ``<a>`` under ``div.types/span`` the type id is pulled out of
    the link's ``href`` with the module-level ``type_pattern``, printed, and
    handed to ``movie_total``.

    Args:
        url: Page that lists the categories.
    """
    page_source = get_response(url)
    tree = etree.HTML(page_source)
    # Each anchor looks like:
    #   /typerank?type_name=剧情&type=11&interval_id=100:90&action=
    for anchor in tree.xpath('//div[@class="types"]/span/a'):
        hrefs = anchor.xpath('@href')
        # The regex runs on the string form of the href list and the first
        # captured group is the type id.
        matches = type_pattern.findall(str(hrefs))
        type_id = matches[0]
        print(type_id)
        movie_total(type_id)
# Total number of movies in one category
def movie_total(type_id):
    """Request the count for category ``type_id`` and print its total.

    Sets the module-level ``flag`` to 1 before the request, so
    ``get_response`` returns parsed JSON from this point on.

    Args:
        type_id: Category id, substituted into ``total_url``.
    """
    global flag
    flag = 1
    # Fill the category id into the URL template and fetch it.
    payload = get_response(total_url.format(type_id))
    # print(payload)
    # The count is stored under the 'total' key.
    total = payload['total']
    print(total)
    # ...
# Function intended to collect all movie information
def movie(type_id):
    """Placeholder: the body is not implemented in this file.

    Args:
        type_id: Category id (unused so far).
    """
    ...
if __name__ == '__main__':
    # Regex that captures the value of the "type" query parameter.
    type_pattern = re.compile('&type=(.*?)&')
    # Request headers read by get_response().
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
    }
    # Return-mode switch read by get_response(): 0 -> text, anything else -> JSON.
    flag = 0
    # Entry page that lists every category.
    base_url = 'https://movie.douban.com/chart'
    # Per-category URL template; {} receives the type id.
    # NOTE(review): movie_total() parses the response from this URL as JSON
    # and reads its 'total' key - confirm this address really serves JSON.
    total_url = 'https://movie.douban.com/typerank?type_name=%E5%96%9C%E5%89%A7&type={}&interval_id=100:90&action='
    # URL for the movie list itself (not referenced by the code visible here).
    movie_url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=0&limit=1'
    get_type(base_url)
2.selenium的使用
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# Start a browser without a visible window (PhantomJS).
driver = webdriver.PhantomJS(executable_path=r'D:\pachong\phantomjs-2.1.1-windows\bin\phantomjs.exe')
try:
    driver.get(url='https://www.baidu.com/')
    # The commented-out snippets below are a reference of common WebDriver
    # calls. Uncomment them inside this try block so the browser is still
    # shut down afterwards.

    # Page title: title
    # print(driver.title)
    # URL currently being visited: current_url
    # print(driver.current_url)
    # All cookies: get_cookies()
    # print(driver.get_cookies())

    # Locating elements
    # By id: find_element_by_id('')
    # id_kw = driver.find_element_by_id('kw')
    # # print(id_kw)
    # By class name
    # find_elements_by_class_name returns a list
    # find_element_by_class_name returns a single element
    # s_ipt =driver.find_element_by_class_name('s_ipt')
    # print(s_ipt) #<selenium.webdriver.remote.webelement.WebElement (session="8a146fb0-175b-11eb-8a1e-b7a9f72c5c1e", element=":wdc:1603696750938")>
    # s_ipt =driver.find_elements_by_class_name('s_ipt')
    # print(s_ipt) #[<selenium.webdriver.remote.webelement.WebElement (session="8a146fb0-175b-11eb-8a1e-b7a9f72c5c1e", element=":wdc:1603696750938")>]
    # By name attribute: find_elements_by_name()
    # name = driver.find_elements_by_name('wd')
    # print(name)
    # By XPath: find_elements_by_xpath()
    # kw_xpath = driver.find_elements_by_xpath('//input[@id="kw"]')
    # print(kw_xpath)
    # .text reads the content between an element's tags (find_element_by_xpath)
    # content = driver.find_element_by_xpath('//div[@id="s-top-left"]').text
    # print(content)
    # get_attribute('attribute name') reads the value of that attribute
    # contet =driver.find_element_by_xpath('//input[@id="kw"]').get_attribute('class')
    # print(contet)

    # Screenshot: save_screenshot()
    # driver.save_screenshot('baidu.jpg')
    # Typing into an input box: send_keys()
    # driver.find_element_by_xpath('//input[@id="kw"]').send_keys('彭于晏')
    # driver.save_screenshot('pyy.jpg')
    # Clicking: click()
    # driver.find_element_by_xpath('//input[@id="kw"]').send_keys('彭于晏')
    # driver.find_element_by_id('su').click()
    # time.sleep(2)
    # driver.save_screenshot('pyy.jpg')
    # Keyboard shortcuts
    # driver.find_element_by_xpath('//input[@id="kw"]').send_keys('彭于晏')
    # # driver.save_screenshot('输入.png')
    # # driver.find_element_by_xpath('//input[@id="kw"]').send_keys(Keys.CONTROL,'a')
    # # driver.save_screenshot('全选.png')
    # # driver.find_element_by_xpath('//input[@id="kw"]').send_keys(Keys.CONTROL,'x')
    # # driver.save_screenshot('剪贴.png')
finally:
    # Always end the session; otherwise the browser process is left running
    # after the script finishes or fails.
    driver.quit()
3.使用selenium调用谷歌浏览器
# 导包
from selenium import webdriver
#导入配置
from selenium.webdriver.chrome.options import Options
# 1. Create the Options object.
options = Options()
# 2. Add settings with add_argument().
options.add_argument('--headless')
# options.add_argument('user-agent="Mozilla/5.0 (Linux; U; Android 8.0.0; zh-CN; MHA-AL00 Build/HUAWEIMHA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.108 UCBrowser/12.1.4.994 Mobile Safari/537.36"')
# Start the browser.
# A driver binary has to be downloaded separately; pick the one whose major
# version matches the installed Chrome.
driver = webdriver.Chrome(executable_path=r'C:\Users\Administrator\Downloads\chromedriver.exe',chrome_options=options)
try:
    # Maximize the window: some content is only shown when it is maximized.
    driver.maximize_window()
    driver.get(url='https://www.baidu.com/')
finally:
    # A headless browser has no window to close by hand, so end the session
    # explicitly instead of leaving the process behind.
    driver.quit()
4.豆瓣读书
# 导包
from selenium import webdriver
from lxml import etree
# Start the browser.
driver = webdriver.Chrome(executable_path=r'C:\Users\Administrator\Downloads\chromedriver.exe')
# Maximize the window (disabled)
# driver.maximize_window()
try:
    # Load the search result page.
    driver.get(url='https://search.douban.com/book/subject_search?search_text=python&cat=1001')
    # Turn the rendered page source into an lxml element tree.
    html = etree.HTML(driver.page_source)
finally:
    # The parsed tree is all that is needed from here on, so release the
    # browser even if loading failed.
    driver.quit()

# One "detail" div per search result.
content_list = html.xpath('//div[@class="item-root"]/div[@class="detail"]')
for content in content_list:
    # Book title
    name = content.xpath('./div[@class="title"]/a/text()')[0]
    # Score. The path must start with "." so the search stays inside this
    # result; a bare "//" searches the whole document and would give every
    # book the first book's score.
    score_nodes = content.xpath('.//span[@class="rating_nums"]/text()')
    # A result without a score element yields an empty string rather than an
    # IndexError.
    score = score_nodes[0] if score_nodes else ''
    # Number of ratings, looked up relative to this result for the same reason.
    person_nodes = content.xpath('.//span[@class="pl"]/text()')
    person = person_nodes[0] if person_nodes else ''
    print(name,score,person)
5.使用selenium 获取斗鱼直播信息
# 导包
import time
from selenium import webdriver
from lxml import etree
from selenium.webdriver.common.keys import Keys
from openpyxl import Workbook
# Start the browser.
driver = webdriver.Chrome(executable_path=r'C:\Users\Administrator\Downloads\chromedriver.exe')
try:
    driver.get(url='https://www.douyu.com/g_hpjy')
    # Maximize the window.
    driver.maximize_window()
    # Open the output file once for the whole crawl instead of once per row.
    with open('斗鱼.txt', 'a', encoding='utf8') as fp:
        while True:
            # Give the current page time to render before reading its source.
            time.sleep(3)
            html = etree.HTML(driver.page_source)
            ul_list = html.xpath('//ul[@class="layout-Cover-list"]/li[@class="layout-Cover-item"]/div[@class="DyListCover HeaderCell is-href"]')
            for ul in ul_list:
                # Streamer name
                name = ul.xpath('.//h2/text()|.//h2/div/text()')[0]
                # Stream title
                title = ul.xpath('.//h3[@class="DyListCover-intro"]/text()')[0]
                # Popularity figure
                svg = ul.xpath('.//div[2]/span/text()')[0]
                print(name, title, svg)
                dic = {'主播': name, '标题': title, '人气': svg}
                fp.write(str(dic) + '\n')
            # The last <li> of the pager is the "next page" control; its
            # aria-disabled attribute is 'false' while more pages remain.
            next_disabled = driver.find_element_by_xpath('//*[@id="listAll"]/div[2]/div/ul/li[last()]').get_attribute('aria-disabled')
            if next_disabled != 'false':
                break
            driver.find_element_by_xpath('//*[@id="listAll"]/div[2]/div/ul/li[last()]/span').click()
finally:
    # Close the browser on the normal exit path and on any error (a missing
    # element, a failed click), so no browser process is left behind.
    driver.quit()