Python homework — June 14

# 爬虫原理 提取有用数据 保存
# 互联网
# 浏览器 发送http请求
# 梨视频:分析视频文件 保存到本地
# 今日内容
# request模块的详细使用
# selenium模块
# 1 请求库
# 2 解析库
# 3 存储库
import requests
import re
# response=requests.get(url='https://www.pearvideo.com/video_1566066')
# <a href="video_1461493" class="actplay openapp" target="_blank">
# <div class="video-main">
# <img class="img" src="https://image.pearvideo.com/cont/20181023/cont-1461493-11648067.png" alt="离婚后需要帮前妻还债吗?">
# <div class="vdo-time">02:35</div></div>
# <div class="vdo-summary">
# <p class="vdo-tt">离婚后需要帮前妻还债吗?</p>
# <div class="none vdo-open-tips"></div>
# </div>
# </a>
# re.findall('正则表达式','jiexiewenben','zhengzemoshi')
# print(response.status_code)
# print(response.text)
# res=re.findall('<a href="video_(.*?)"',response.text,re.S)
# # 。当前位置 *所有 ?找到就停下来
# print(res)
# for m_id in res:
# detail_url='https://www.pearvideo.com/video_'+m_id
# print(detail_url)
# 获取视频详情页
import uuid

 

# 1 发送请求
# 1. Send the HTTP request
def get_page(url):
    """Fetch *url* with a plain GET and return the requests.Response.

    :param url: absolute URL to fetch
    :return: requests.Response (caller reads .text or .content)
    """
    # NOTE(review): no timeout is set, so a stalled server hangs the crawl
    # forever -- consider requests.get(url, timeout=...) in production.
    response = requests.get(url)
    return response
# 2 解析数据

 

# 2. Parse the index page
def parse_index(text):
    """Extract video ids from the pearvideo index HTML and build detail URLs.

    :param text: HTML of https://www.pearvideo.com/
    :return: list of absolute detail-page URLs (empty list if no links found)
    """
    # Each video link on the index page looks like <a href="video_<id>" ...>.
    movie_ids = re.findall('<a href="video_(.*?)"', text, re.S)
    return ['https://www.pearvideo.com/video_' + m_id for m_id in movie_ids]
def parse_detail(text):
    """Pull the first srcUrl="..." value (the mp4 address) from a detail page.

    :param text: HTML of a pearvideo detail page
    :return: the video file URL
    :raises IndexError: if no srcUrl="..." occurs in *text*
    """
    movie_url = re.findall('srcUrl="(.*?)"', text, re.S)[0]
    return movie_url
# 3 保存数据
def save_movie(movie_url):
    """Download *movie_url* and write it to a fresh .mp4 file in the CWD.

    :param movie_url: direct URL of the video file
    """
    response = requests.get(movie_url)
    # uuid4 gives every download a unique filename, so concurrent saves
    # (e.g. from the thread pool in __main__) cannot clobber each other.
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        f.write(response.content)
        f.flush()
# Thread pool for the I/O-bound downloads.  The original code did
# `from multiprocessing import pool` and called `pool.submit(...)`, but that
# imports a *module* (which has no .submit) and crashed; ThreadPoolExecutor
# is the correct API here.
from concurrent.futures import ThreadPoolExecutor

if __name__ == "__main__":
    # Crawl the index page, resolve each video's real mp4 URL, then hand
    # each download to the pool so the network waits overlap.
    executor = ThreadPoolExecutor(max_workers=10)
    index_res = get_page(url='https://www.pearvideo.com/')
    for detail_url in parse_index(index_res.text):
        detail_res = get_page(url=detail_url)
        movie_url = parse_detail(detail_res.text)
        executor.submit(save_movie, movie_url)
    # Block until every queued download has finished before exiting.
    executor.shutdown(wait=True)

Homework:
'''
Homework: crawl the Douban Top250 index page.

Index page:
    https://movie.douban.com/top250
    GET
    User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) ... Chrome/74 ...

One regex match per movie item yields, in group order:
    detail-page url, poster url, title, summary paragraph, rating,
    vote count, one-line quote
'''
import requests
import re

url = 'https://movie.douban.com/top250'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
# 1. Request the Douban Top250 index page (the UA header avoids the 418
#    anti-bot response Douban returns to bare requests).
response = requests.get(url, headers=headers)

# 2. Extract every movie's fields with one regex per <div class="item">.
#    Fixes over the original: the garbled '<p class>==<p class>' and the
#    '<spam class="inq">' typo are repaired, and the group order now follows
#    the actual page markup, where <p class=""> (the summary) appears BEFORE
#    <span class="rating_num"> inside each item -- with the old order the
#    lazy .*? would have crossed into the next item.
# NOTE(review): a few Top250 entries have no <span class="inq"> quote; for
#    those this single-regex approach still mis-aligns -- verify if exact
#    coverage of all 25 items per page matters.
movie_content_list = re.findall(
    '<div class="item">.*?href="(.*?)">.*?src="(.*?)"'
    '.*?<span class="title">(.*?)</span>'
    '.*?<p class="">(.*?)</p>'
    '.*?<span class="rating_num".*?>(.*?)</span>'
    '.*?<span>(.*?)人评价'
    '.*?<span class="inq">(.*?)</span>',
    response.text,
    re.S)

for movie_content in movie_content_list:
    # Unpack in the same order as the regex groups above.
    detail_url, movie_jpg, name, yu, point, num, jili = movie_content
    data = f'电影名称:{name},   详情页url:{detail_url}, 图片url:{movie_jpg}, 评分: {point}, 评价人数: {num},介绍语:{yu}激励语;{jili} \n'
    # print(data)

    # 3. Append this movie's record to li.txt
    with open('li.txt', 'a', encoding='utf-8') as f:
        f.write(data)
Reposted from: https://www.cnblogs.com/jjjpython1/p/11061838.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值