Python_day04

Homework: scrape the Douban Top 250

'''
Scrape the Douban Top 250 movie listings.

Pages:
    Page 1:
        https://movie.douban.com/top250?start=0&filter=
    Page 2:
        https://movie.douban.com/top250?start=25&filter=
    Page 3:
        https://movie.douban.com/top250?start=50&filter=
    Page 4:
        https://movie.douban.com/top250?start=75&filter=
    Page 10:
        https://movie.douban.com/top250?start=225&filter=

    GET
    User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36

re pattern:
    # detail-page url, poster url, title, director, cast/release year, rating, number of ratings, one-line summary
    <div class="item">.*?href="(.*?)">.*?src="(.*?)" class="">.*?<span class="title">(.*?)</span>.*?<div class="bd">.*?导演:(.*?)<br>(.*?)</p>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价.*?<span class="inq">(.*?)</span>
'''
import requests
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}
base_url = 'https://movie.douban.com/top250?start={}&filter='

for page in range(10):  # 10 pages, 25 movies per page
    url = base_url.format(page * 25)
    print(url)

    # 1. Send a request to the Douban Top 250 page and get the response
    response = requests.get(url, headers=headers)

    # print(response.text)

    # 2. Extract the data with a regular expression
    # detail-page url, poster url, title, director, cast/release year, rating, number of ratings, summary
    movie_content_list = re.findall(
        # regex pattern; 导演: (director) and 人评价 (people rated) match literal text on the page
        '<div class="item">.*?href="(.*?)">.*?src="(.*?)" class="">.*?<span class="title">(.*?)</span>.*?<div class="bd">.*?导演:(.*?)<br>(.*?)</p>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价.*?<span class="inq">(.*?)</span>',

        # text to parse
        response.text,

        # re.S lets '.' also match newlines, so the pattern can span lines
        re.S)

    for movie_content in movie_content_list:
        # unpack the fields of one movie
        detail_url, movie_jpg, name, daoyan, timer, point, num, desc = movie_content
        data = f'title: {name}, detail url: {detail_url}, poster url: {movie_jpg}, director: {daoyan}, cast/year: {timer}, rating: {point}, ratings count: {num}, summary: {desc}\n'
        print(data)

        # 3. Save the data: append each movie's record to a file
        with open('douban.txt', 'a', encoding='utf-8') as f:
            f.write(data)
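
Rather than formatting the query string by hand, requests can build it from a dict through the params argument (standard requests behavior). A minimal sketch of the same first-page request:

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}
# requests url-encodes the dict into '?start=0&filter=' for us
response = requests.get(
    'https://movie.douban.com/top250',
    params={'start': 0, 'filter': ''},
    headers=headers,
)
print(response.url)  # https://movie.douban.com/top250?start=0&filter=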

Detailed usage of the requests module

# requests in more detail

# send a request to a Pearvideo detail page and fetch the response data

'''
Scrape Pearvideo

Request URL:
    Request URL: https://www.pearvideo.com/

Request method:
    GET

Request headers:
    user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36
'''

import requests
import re  # regex module
# uuid.uuid4() generates a random unique ID, handy as a file name
import uuid


# A crawler in three steps
# 1. Send the request
def get_page(url):
    response = requests.get(url)
    return response

# 2. Parse the home-page data
# extract the video detail-page IDs from the home page
def parse_index(text):
    # re.findall(pattern, text, flags)
    # re.S lets '.' also match newlines, so the pattern applies across the whole document
    # .  matches any single character
    # *? matches any number of characters, as few as possible (non-greedy)
    res = re.findall('a href="video_(.*?)"', text, re.S)
    # print(res)

    detail_url_list = []
    for m_id in res:
        # join the ID onto the base url to build the detail-page url
        detail_url = 'https://www.pearvideo.com/video_' + m_id
        # print(detail_url)
        detail_url_list.append(detail_url)
    print(detail_url_list)

    return detail_url_list

# parse a detail page to get the video file url
def parse_detail(text):
    '''
    (.*?) : capture group, extracts the matched content
    .*?   : non-greedy match, consumed but not captured
    '''
    movie_url = re.findall('srcUrl="(.*?)"', text, re.S)[0]
    return movie_url

# 3. Save the data
def save_movie(movie_url):
    response = requests.get(movie_url)
    # write the video to a local file
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        f.write(response.content)
        f.flush()



if __name__ == '__main__':
    # 1. request the home page
    index_res = get_page(url='https://www.pearvideo.com/')

    # 2. parse the home page to get the detail-page urls
    detail_url_list = parse_index(index_res.text)

    # 3. request each detail page
    for detail_url in detail_url_list:
        detail_res = get_page(url=detail_url)

        # 4. parse the detail page to get the video url
        movie_url = parse_detail(detail_res.text)
        print(movie_url)

        # 5. save the video
        save_movie(movie_url)
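
Reading response.content buffers the whole video in memory before writing it. A minimal streamed variant of save_movie (a sketch; stream=True and iter_content are standard requests features, and the 1 MB chunk size is an arbitrary choice):

import uuid
import requests

def save_movie_streamed(movie_url):
    # stream=True defers downloading the body until we iterate over it
    response = requests.get(movie_url, stream=True)
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        # write the file 1 MB at a time instead of holding it all in memory
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)
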
# Visit Zhihu Explore

'''
Request URL: https://www.zhihu.com/explore

Request Method: GET

user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36
'''

# Visit Zhihu without custom headers (typically rejected without a browser User-Agent)
# import requests
# response = requests.get(url='https://www.zhihu.com/explore')
# print(response.status_code)
# print(response.text)



# # Visit Zhihu with a request-headers parameter
#
# import requests
#
# # request-headers dict
# headers = {
#     'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
# }
#
# # pass headers into the get request
# response = requests.get(url='https://www.zhihu.com/explore', headers=headers)
# print(response.status_code)
# # print(response.text)
#
# with open('zhihu.html', 'w', encoding='utf-8') as f:
#     f.write(response.text)
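
When every request needs the same headers, a requests.Session can carry them automatically (a minimal sketch; Session and its headers attribute are standard requests features):

import requests

session = requests.Session()
# headers set on the session are sent with every request it makes
session.headers.update({
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
})
response = session.get('https://www.zhihu.com/explore')
print(response.status_code)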


'''
Carrying cookies
Use a logged-in cookie to get past the cnblogs login check

Request URL:
    Request URL: https://home.cnblogs.com/set/

Request Method: GET

Request headers:
user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36

cookie: _ga=GA1.2.897792622.1560415926; _gid=GA1.2.2097222173.1560415926; __gads=ID=90a50b775a32932c:T=1560415924:S=ALNI_MbW1TkLegg86CkkukfahBA7y3wgnw; .Cnblogs.AspNetCore.Cookies=CfDJ8D8Q4oM3DPZMgpKI1MnYlrkt64Ro4vuztDN4I9mCKsOpo66KpTVpoyHTc3VgN3z62nZN14QzEMAPB1O5u7etKJDD6uU9zOhearKCZDYU7iDhDQucYSDobytq6uMDPmc3zFfgfMCs1UB5EdhPnpGGFWrZHCXZbLnLOsrdf98km6FgjfWqfVjKIzxzGq8NZTXmwXie-musLJnFRtnCqc5UsdseCokD6Ea1nMnI57Qa8V-rVLWfrzNhoMAwa6C68qe_I4wyRGRmSORbT2UE6JzNEtgkEftTqU0ZbhpBSw0K7rzwymnI8PYt6CmVq1GpKy6Xiz-cYxkcyUPCD7YUvWTy-E7O9C81fONFS50KTEtJgjHKcioDKlivHKKeKlj62Qso5ITIP9rGbRtE-aajWrkqap5Phifm9T96hnTjWf3heC0ihwOevA4Ywa8EmTky0xYz_6D53Kflvmp3peXI4g-67bo; .CNBlogsCookie=A19E10B88B12CA6DE89A7CDD8BD2178DC8114CDB3ECECB0F5174F7F0031898EEDC0DD2DD35656021DB87E62104F7C96CF6CECE773B09298826E467457E303F9B52A44B14CC964C0BDDEBF2F3578618F45F2CFD20
'''

import requests

# request url
url = 'https://home.cnblogs.com/set/'

# request headers, with the login cookie spliced in
# headers = {
#     'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
#
#     # splice the cookie into the request headers
#     'cookie': '_ga=GA1.2.897792622.1560415926; _gid=GA1.2.2097222173.1560415926; __gads=ID=90a50b775a32932c:T=1560415924:S=ALNI_MbW1TkLegg86CkkukfahBA7y3wgnw; .Cnblogs.AspNetCore.Cookies=CfDJ8D8Q4oM3DPZMgpKI1MnYlrkt64Ro4vuztDN4I9mCKsOpo66KpTVpoyHTc3VgN3z62nZN14QzEMAPB1O5u7etKJDD6uU9zOhearKCZDYU7iDhDQucYSDobytq6uMDPmc3zFfgfMCs1UB5EdhPnpGGFWrZHCXZbLnLOsrdf98km6FgjfWqfVjKIzxzGq8NZTXmwXie-musLJnFRtnCqc5UsdseCokD6Ea1nMnI57Qa8V-rVLWfrzNhoMAwa6C68qe_I4wyRGRmSORbT2UE6JzNEtgkEftTqU0ZbhpBSw0K7rzwymnI8PYt6CmVq1GpKy6Xiz-cYxkcyUPCD7YUvWTy-E7O9C81fONFS50KTEtJgjHKcioDKlivHKKeKlj62Qso5ITIP9rGbRtE-aajWrkqap5Phifm9T96hnTjWf3heC0ihwOevA4Ywa8EmTky0xYz_6D53Kflvmp3peXI4g-67bo; .CNBlogsCookie=A19E10B88B12CA6DE89A7CDD8BD2178DC8114CDB3ECECB0F5174F7F0031898EEDC0DD2DD35656021DB87E62104F7C96CF6CECE773B09298826E467457E303F9B52A44B14CC964C0BDDEBF2F3578618F45F2CFD20'
# }
# boke_res = requests.get(url, headers=headers)
#
# # the account email appears on the settings page only when logged in
# print('1277886541@qq.com' in boke_res.text)
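
Instead of splicing the raw cookie string into the headers, requests also accepts cookies as a dict through the cookies parameter (a minimal sketch; the cookie names are taken from the string above, and the placeholder values must be copied from a logged-in browser session):

import requests

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
# cookie names from the cookie string above; values are placeholders
cookies = {
    '.CNBlogsCookie': '<value copied from the browser>',
    '.Cnblogs.AspNetCore.Cookies': '<value copied from the browser>',
}
response = requests.get('https://home.cnblogs.com/set/', headers=headers, cookies=cookies)
print(response.status_code)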



 

Reposted from: https://www.cnblogs.com/TuLie-cs/p/11040774.html
