# 手动输入一个影视名,在三米影视(对不起你了)爬取相关影视链接。
# 需求,自动打开三米影视,搜索加勒比海盗,将所有加勒比海盗电影链接进行爬取
import asyncio
import os
import time

import aiohttp
from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
inp = input("请输入电影名:")
url = 'https://www.smmy365.com'
# Step 1: drive Edge via selenium, open the site and run the search.
# (Edge is used, so no anti-detection tweaks are applied.)
# Raw string: the original non-raw path relied on Python keeping unknown
# backslash escapes, which is deprecated; r'' makes the path unambiguous.
bro = webdriver.Edge(executable_path=r'D:\系统\Driver_Notes\msedgedriver.exe')
bro.get(url)
# Give the page a moment to render before touching the DOM.
time.sleep(1)
src_input = bro.find_element_by_class_name('search-text')
src_input.send_keys(inp)
time.sleep(0.5)
search_btn = bro.find_element_by_class_name('search-btn')
search_btn.click()

# Step 2: parse every result page, collecting movie name + detail-page URL.
url_list = []  # one {'电影名': ..., '电影详情页': ...} dict per result
page_text = bro.page_source
tree = etree.HTML(page_text)
# The pager span reads like "1/N页"; extract the total page count N.
pageNum_text = tree.xpath('//div[@class = "page-btn uipages upage"]/ul[1]/span/text()')[0]
pageNum = pageNum_text.split('/')[-1].split('页')[0]
for i in range(int(pageNum)):
    print('正在爬取第%d页数据' % (i + 1))
    page_text = bro.page_source
    tree = etree.HTML(page_text)
    ul_list = tree.xpath('//ul[@class = "serach-ul"]')
    for ul in ul_list:
        mov_url = 'https://www.smmy365.com' + ul.xpath('./li/a/@href')[0]
        mov_name = ul.xpath('./li/a/@title')[0]
        url_list.append({'电影名': mov_name, '电影详情页': mov_url})
    # Advance to the next page; on the last page the link is absent,
    # so only that specific lookup failure is swallowed (was a bare except).
    try:
        bro.find_element_by_xpath('//a[@class="next pagegbk"]').click()
        time.sleep(2)
    except NoSuchElementException:
        pass
print(url_list)
# The browser is no longer needed past this point; release it.
bro.quit()
# 3. 通过aiohttp进行详情页数据的爬取,获取所需数据
# Request headers for aiohttp: present a desktop Chrome user agent so the
# site serves the same markup the selenium session saw.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
)
headers = {"User-Agent": _USER_AGENT}
async def get_url(url, name):
    """Fetch one movie's detail page and append every source link found
    there to ./电影/三米影视_<name>.txt.

    :param url:  absolute detail-page URL
    :param name: movie title, used only to build the output file name
    """
    async with aiohttp.ClientSession() as session:
        # session.get() is itself an async context manager — the extra
        # ``await`` in the original was redundant.
        async with session.get(url, headers=headers) as response:
            page_text = await response.text()
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//ul[@class="details-con2-list"]/li')
    # Ensure the output directory exists; open() alone would raise
    # FileNotFoundError on a fresh checkout.
    os.makedirs('./电影', exist_ok=True)
    filePath = "./电影/三米影视_" + name + '.txt'
    # Open the file once instead of once per <li>.
    with open(filePath, 'a', encoding='utf-8') as f:
        for li in li_list:
            film_name = li.xpath('./a/@title')[0] + li.xpath('./a/text()')[0]
            film_url = 'https://www.smmy365.com' + li.xpath('./a/@href')[0]
            f.write(film_name + ':' + film_url + '\n')
async def _crawl_all(items):
    """Download all detail pages concurrently inside one event loop."""
    await asyncio.gather(*(get_url(it['电影详情页'], it['电影名']) for it in items))

# asyncio.wait() no longer accepts bare coroutine objects (deprecated in
# 3.8, TypeError on 3.11+); a single entry coroutine using gather is the
# supported pattern for asyncio.run().
asyncio.run(_crawl_all(url_list))